# Detect duplicate genomes
#' @name detect_duplicate_genomes
#' @title Compute pairwise genome similarity or distance between individuals
#' to highligh potential duplicate individuals
#' @description The function can compute two methods
#' to highligh potential duplicate individuals.
#' \enumerate{
#' \item distance between individuals and/or
#' \item pairwise genome similarity
#' }
#' @param data (4 options) A file or object generated by radiator:
#' \itemize{
#' \item tidy data
#' \item Genomic Data Structure (GDS)
#' }
#'
#' \emph{How to get GDS and tidy data ?}
#' Look into \code{\link{tidy_genomic_data}},
#' \code{\link{read_vcf}} or
#' \code{\link{tidy_vcf}}.
#' @param detect.duplicate.genomes (optional, logical) For use inside radiator pipelines.
#' Default: \code{detect.duplicate.genomes = TRUE}.
# @param subsample.markers (optional, integer) To speed up computation and rapidly
# test the function's arguments (e.g. using 200 markers).
# Default: \code{subsample.markers = NULL}.
# @param random.seed (integer, optional) For reproducibility, set an integer
# for randomness when argument \code{subsample.markers} is used.
# By default, a random number is generated and printed.
# Default: \code{random.seed = NULL}.
#' @param distance.method (character) Depending on input data, 2 different methods
#' are used (give similar results):
#' \itemize{
#' \item gds data The calculation is fast it's \code{SNPRelate::snpgdsIBS} under
#' the hood.
#' \item tidy data The distance measure uses \code{amap::Dist}.
#' This must be one of "euclidean", "maximum", "manhattan", "canberra", "binary".
#' }
#' Using \code{distance.method = NULL} will not run this method.
#' Default: \code{distance.method = "manhattan"}. This is very fast
#' compared to the genome similarity method. It uses allele counts and the codes
#' are tailored for biallelic and multiallelic markers.
#' @param genome (logical) Computes pairwise genome similarity in parallel.
#' The proportion of the shared genotypes is averaged across shared markers between
#' each pairwise comparison. This method makes filtering easier because the
#' threshold is more intuitive with the plots produced, but it's much longer
#' to run, even in parallel, so better to run overnight.
#' Default: \code{genome = FALSE}.
#' @param threshold.common.markers (double, optional) When using the
#' pairwise genome similarity approach (\code{genome = TRUE}), using a threshold
#' will filter the pairwise comparison before generating the results. This is
#' usefull if the number of pairwise comparisons (n*(n-1)/2) is very large and
#' allows to reduce the number of false positive, when missing data is involved.
#' e.g. 2 samples might have 100% markers in common, but if they only have
#' 30% of their genotyped markers in common, this is a poor fit.
#' Default: \code{threshold.common.markers = NULL}.
#'
#' @param dup.threshold Default: \code{dup.threshold = 0}. Turn off filter.
#' @param blacklist.duplicates (optional, logical)
#' With \code{blacklist.duplicates = TRUE}, after visualization,
#' the user is asked to enter a threshold to filter out duplicates.
#' With default, \code{blacklist.duplicates = FALSE}, the function as no
#' interaction with user.
#' @inheritParams radiator_common_arguments
#' @return A list with potentially 8 objects:
#' \itemize{
#' \item \code{$distance }: results of the distance method.
#' \item \code{$distance.stats}: Summary statistics of the distance method.
#' \item \code{$pairwise.genome.similarity}: results of the genome method.
#' \item \code{$genome.stats}: Summary statistics of the genome method.
#' \item \code{$violin.plot.distance}: violin plot showing the distribution of pairwise distances.
#' \item \code{$manhattan.plot.distance}: same info different visual with manhattan plot.
#' \item \code{$violin.plot.genome}: violin plot showing the distribution of pairwise genome similarities.
#' \item \code{$manhattan.plot.genome}: same info different visual with manhattan plot.
#' \item \code{$blacklist.id.similar}: blacklisted duplicates.
#' }
#'
#' Saved in the working directory:
#' individuals.pairwise.dist.tsv, individuals.pairwise.distance.stats.tsv,
#' individuals.pairwise.genome.similarity.tsv, individuals.pairwise.genome.stats.tsv,
#' blackliste.id.similar.tsv, blacklist.pairs.threshold.tsv
#' @details
#' Strategically, run the default first (\code{distance.method},
#' no \code{genome})
#'
#' \strong{\code{distance.method} argument is fast, but...}
#'
#' you don't know if the observed comparison (close or distant)
#' is influenced by missing values/the number of markers in common
#' between the pair compared. This is something that needs to be considered.
#' Be suspicious of a \emph{distant outlier} from the same pop pairwise comparison,
#' and similarly, be suspicious of a \emph{close outlier} from a different pop
#' pairwise comparisons.
#'
#' If there is no outlier, don't bother running the function again with
#' (\code{genome = TRUE}).
#'
#'\strong{Relative distance}
#'
#' Is the normalized distance for your dataset (not calculated by strata). For
#' each individual, it's the distance divided by the maximum distance observed.
#' The range is limited between 0 and 1. Closer to 0 = the more similar and
#' closer to 1, the more distant.
#'
#' \strong{\code{genome = TRUE}}
#'
#' The function will run slower, but...
#' If you see outliers with the first run, take the time to run the function
#' with \code{genome = TRUE}. Because this option is much better at detecting
#' duplicated individuals and it also shows the impact of \strong{missingness}
#' or the number of \strong{shared markers} between comparisons.
#'
#' \emph{Your outlier duo could well be the result of one of the individual having
#' an extremely low number of genotypes...}
#' @export
#' @rdname detect_duplicate_genomes
#' @examples
#' \dontrun{
#' # First run and simplest way (if you have the tidy tibble):
#' dup <- radiator::detect_duplicate_genomes(data = "wombat_tidy.rad")
#'
#' # This will use by default:
#' # distance.method = "manhattan"
#' # genome = FALSE
#' # parallel.core = all my CPUs - 1
#'
#' # If you need a tidy tibble: use one of radiator \code{tidy_} function or
#' # \code{radiator::tidy_genomic_data}
#'
#'
#' # To view the manhattan plot:
#' dup$manhattan.plot.distance
#'
#' # to view the data stats
#' dup.data.stats <- dup$distance.stats
#'
#' # to view the data
#' dup.data <- dup$distance
#'
#' # Based on the look of the distribution using both manhattan and boxplot,
#' # I can filter the dataset to highlight potential duplicates.
#'
#' # To run the distance (with euclidean distance instead of the default manhattan,
#' # and also carry the second analysis (with the genome method):
#' dup <- radiator::tidy_genomic_data(
#' data = wombat_tidy_object,
#' strata = "wombat.strata.tsv",
#' vcf.metadata = FALSE
#' ) %>%
#' radiator::detect_duplicate_genomes(
#' data = .,
#' distance.method = "euclidean",
#' genome = TRUE
#' )
#'
#' # to view the data of the genome data
#' dup.data <- dup$pairwise.genome.similarity
#'
#' # Based on the look of the distribution using both manhattan and boxplot,
#' # I can filter the dataset based on 98% of identical genotype proportion,
#' # to highlight potential duplicates:
#' dup.filtered <- dplyr::filter(.data = dup.data, PROP_IDENTICAL > 0.98)
#'
#' # Get the list of duplicates id
#' dup.list.names <- data.frame(INDIVIDUALS = unique(c(dup.filtered$ID1, dup.filtered$ID2)))
#' }
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
detect_duplicate_genomes <- function(
data,
interactive.filter = TRUE,
detect.duplicate.genomes = TRUE,
dup.threshold = 0,
# subsample.markers = NULL,
# random.seed = NULL,
distance.method = "manhattan",
genome = FALSE,
threshold.common.markers = NULL,
blacklist.duplicates = FALSE,
parallel.core = parallel::detectCores() - 1,
verbose = TRUE,
...
) {
# # Testing
# interactive.filter = TRUE
# detect.duplicate.genomes = TRUE
# dup.threshold = 0
# distance.method = "manhattan"
# genome = FALSE
# threshold.common.markers = NULL
# blacklist.duplicates = FALSE
# parallel.core = parallel::detectCores() - 1
# verbose = TRUE
# random.seed = NULL
# path.folder = NULL
# parameters = NULL
# internal <- FALSE
if (interactive.filter || detect.duplicate.genomes) {
if (interactive.filter) {
verbose <- TRUE
blacklist.duplicates <- TRUE
}
# Cleanup-------------------------------------------------------------------
radiator_function_header(f.name = "detect_duplicate_genomes", verbose = verbose)
file.date <- format(Sys.time(), "%Y%m%d@%H%M")
if (verbose) message("Execution date@time: ", file.date)
old.dir <- getwd()
opt.change <- getOption("width")
options(width = 70)
timing <- radiator_tic()
on.exit(setwd(old.dir), add = TRUE)
on.exit(options(width = opt.change), add = TRUE)
on.exit(radiator_toc(timing, verbose = verbose), add = TRUE)
on.exit(radiator_function_header(f.name = "detect_duplicate_genomes", start = FALSE, verbose = verbose), add = TRUE)
res <- list() # New list to prepare for results
# Function call and dotslist -------------------------------------------------
rad.dots <- radiator_dots(
func.name = as.list(sys.call())[[1]],
fd = rlang::fn_fmls_names(),
args.list = as.list(environment()),
dotslist = rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE),
keepers = c("path.folder", "parameters", "random.seed", "internal"),
verbose = FALSE
)
# Manage missing arguments ---------------------------------------------------
if (missing(data)) rlang::abort("missing data argument")
# Folders---------------------------------------------------------------------
path.folder <- generate_folder(
f = path.folder,
rad.folder = "detect_duplicate_genomes",
internal = internal,
file.date = file.date,
verbose = verbose)
# write the dots file
message("Function call and arguments stored in a file")
write_rad(
data = rad.dots,
path = path.folder,
filename = stringi::stri_join("radiator_detect_duplicate_genomes_args_", file.date, ".tsv"),
tsv = TRUE,
internal = internal,
verbose = verbose
)
# Random seed ----------------------------------------------------------------
if (is.null(random.seed)) random.seed <- sample(x = 1:1000000, size = 1)
set.seed(random.seed)
readr::write_lines(x = random.seed, file = file.path(path.folder, "random.seed"))
if (verbose) message("File written: random.seed (", random.seed,")")
# Detect format --------------------------------------------------------------
data.type <- radiator::detect_genomic_format(data)
if (!data.type %in% c("SeqVarGDSClass", "gds.file")) {
radiator_packages_dep(package = "amap")
# Tidy data
data <- radiator::tidy_wide(data = data, import.metadata = FALSE)
# Filter parameter file: generate and initiate
filters.parameters <- radiator_parameters(
generate = TRUE,
initiate = TRUE,
update = FALSE,
parameter.obj = parameters,
data = data,
path.folder = path.folder,
file.date = file.date,
internal = internal,
verbose = verbose)
if (rlang::has_name(data, "GT_BIN")) {
gt.field <- "GT_BIN"
} else {
gt.field <- "GT"
}
want <- c("MARKERS", "CHROM", "LOCUS", "POS", "STRATA", "INDIVIDUALS", gt.field)#, "REF", "ALT")
data %<>% dplyr::select(tidyselect::any_of(want))
# necessary steps to make sure we work with unique markers and not duplicated LOCUS
if (rlang::has_name(data, "LOCUS") && !rlang::has_name(data, "MARKERS")) {
data <- dplyr::rename(.data = data, MARKERS = LOCUS)
}
# Subsampling ----------------------------------------------------------------
# if (!is.null(subsample.markers)) {
# # Set seed for random sampling
# if (is.null(random.seed)) {
# random.seed <- sample(x = 1:1000000, size = 1)
# set.seed(random.seed)
# message("Random seed used: ", random.seed)
# } else {
# set.seed(random.seed)
# }
# sample.markers <- dplyr::distinct(data, MARKERS) %>%
# dplyr::sample_n(tbl = ., size = subsample.markers) %>%
# readr::write_tsv(x = ., file = file.path(path.folder, stringi::stri_join("subsampled.markers_random.seed_", random.seed, ".tsv"))) %>%
# purrr::flatten_chr(.)
# data <- dplyr::filter(data, MARKERS %in% sample.markers)
# sample.markers <- NULL
# }
# strata
strata <- dplyr::ungroup(data) %>%
dplyr::distinct(STRATA, INDIVIDUALS)
# Preparing data for comparisons ---------------------------------------------
if (verbose) message("Preparing data for analysis")
#Genotyped stats -------------------------------------------------------------
n.markers <- dplyr::n_distinct(data$MARKERS)
if (gt.field == "GT") {
geno.stats <- dplyr::filter(data, GT != "000000") %>%
dplyr::group_by(INDIVIDUALS) %>%
dplyr::summarise(GENOTYPED_PROP = length(GT) / n.markers)
} else {
geno.stats <- dplyr::filter(data, !is.na(GT_BIN)) %>%
dplyr::group_by(INDIVIDUALS) %>%
dplyr::summarise(GENOTYPED_PROP = length(GT_BIN) / n.markers)
}
readr::write_tsv(
x = geno.stats,
file = file.path(path.folder, "genotyped.statistics.tsv"))
# GT_BIN available
# for biallelic data set, just need to keep
if (!is.null(distance.method) & gt.field == "GT_BIN") {
# input.prep <- dplyr::ungroup(data) %>%
# dplyr::select(MARKERS, INDIVIDUALS, n = GT_BIN) %>%
# dplyr::mutate(MARKERS_ALLELES = MARKERS) %>%
# dplyr::arrange(MARKERS_ALLELES, INDIVIDUALS)
input.prep <- dplyr::ungroup(data) %>%
dplyr::select(MARKERS, INDIVIDUALS, ALT = GT_BIN) %>%
dplyr::mutate(
REF = 2 - ALT,
REF = as.integer(REF),
ALT = as.integer(ALT)
) %>%
rad_long(
x = .,
cols = c("MARKERS", "INDIVIDUALS"),
names_to = "ALLELES",
values_to = "n",
variable_factor = FALSE
) %>%
dplyr::mutate(MARKERS_ALLELES = stringi::stri_join(MARKERS, ALLELES, sep = ".")) %>%
dplyr::select(-ALLELES) %>%
dplyr::arrange(MARKERS_ALLELES, INDIVIDUALS)
}
# GT_BIN NOT available
if (gt.field != "GT_BIN") {
# Allele count
missing.geno <- dplyr::select(.data = data, MARKERS, INDIVIDUALS, GT) %>%
dplyr::filter(GT == "000000") %>%
dplyr::select(-GT) %>%
dplyr::mutate(MISSING = rep("blacklist", n()))
if (verbose) message("Preparing data: calculating allele count")
input.prep <- dplyr::select(data, MARKERS, INDIVIDUALS, GT) %>%
radiator_future(
.x = .,
.f = allele_count,
flat.future = "dfr",
split.vec = TRUE,
split.with = "MARKERS",
split.chunks = 10L,
parallel.core = parallel.core
)
if (nrow(missing.geno) > 0) {
input.prep %<>% dplyr::anti_join(missing.geno, by = c("MARKERS", "INDIVIDUALS"))
}
input.prep %<>%
dplyr::mutate(MARKERS_ALLELES = stringi::stri_join(MARKERS, GT, sep = ".")) %>%
dplyr::select(-GT) %>%
dplyr::arrange(MARKERS_ALLELES, INDIVIDUALS)
missing.geno <- NULL # unused object
}#end preparing data
} else {
if (data.type == "gds.file") {
data <- radiator::read_rad(data, verbose = verbose)
data.type <- "SeqVarGDSClass"
}
radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)
# Filter parameter file: generate and initiate
filters.parameters <- radiator_parameters(
generate = TRUE,
initiate = TRUE,
update = FALSE,
parameter.obj = parameters,
data = data,
path.folder = path.folder,
file.date = file.date,
verbose = verbose)
# geno.stats
geno.stats <- generate_stats(
gds = data, # change to data...
markers = FALSE,
missing = TRUE,
heterozygosity = FALSE,
coverage = FALSE,
allele.coverage = FALSE,
exhaustive = FALSE,
plot = FALSE,
file.date = file.date,
parallel.core = parallel.core,
verbose = FALSE,
path.folder = path.folder) %$% i.info %>%
dplyr::mutate(GENOTYPED_PROP = 1 - MISSING_PROP, MISSING_PROP = NULL) #%>% dplyr::rename(POP_ID = STRATA)
strata <- dplyr::distinct(geno.stats, INDIVIDUALS, STRATA)
geno.stats %<>% dplyr::select(INDIVIDUALS, GENOTYPED_PROP )
readr::write_tsv(
x = geno.stats,
file = file.path(path.folder, "genotyped.statistics.tsv"))
genome <- FALSE
}
# Computing distance ---------------------------------------------------------
if (!is.null(distance.method)) {
message("Calculating ", distance.method, " distances between individuals...")
res$distance <- distance_individuals(
x = if (data.type == "tbl_df") {
input.prep
} else {
data
},
strata = strata,
distance.method = distance.method,
parallel.core = parallel.core
) %>%
dplyr::arrange(DISTANCE_RELATIVE) %>%
dplyr::mutate(PAIRS = seq(from = 1, to = n(), by = 1)) %>%
dplyr::arrange(PAIRS)
input.prep <- NULL
res$distance <- suppressWarnings(
dplyr::bind_cols(
res$distance,
dplyr::select(res$distance, ID1, ID2, PAIRS) %>%
dplyr::left_join(dplyr::rename(geno.stats, ID1 = INDIVIDUALS, ID1_G = GENOTYPED_PROP), by = "ID1") %>%
dplyr::left_join(dplyr::rename(geno.stats, ID2 = INDIVIDUALS, ID2_G = GENOTYPED_PROP), by = "ID2") %>%
rad_long(
x = .,
cols = c("ID1", "ID2", "PAIRS"),
names_to = "GENOTYPED_MAX",
values_to = "GENOTYPED_PROP",
variable_factor = FALSE
) %>%
dplyr::group_by(PAIRS) %>%
dplyr::filter(GENOTYPED_PROP == max(GENOTYPED_PROP)) %>%
dplyr::ungroup(.) %>%
dplyr::distinct(PAIRS, .keep_all = TRUE) %>%
dplyr::select(-GENOTYPED_PROP) %>%
dplyr::mutate(GENOTYPED_MAX = dplyr::if_else(GENOTYPED_MAX == "ID1_G", ID1, ID2)) %>%
dplyr::arrange(PAIRS) %>%
dplyr::select(-ID1, -ID2, -PAIRS)
)
)
# test <- res$distance
readr::write_tsv(
x = res$distance,
file = file.path(path.folder, "individuals.pairwise.dist.tsv"),
col_names = TRUE
)
# Stats
message("Generating summary statistics")
res$distance.stats <- res$distance %>%
dplyr::summarise(
MEAN = mean(DISTANCE_RELATIVE, na.rm = TRUE),
MEDIAN = stats::median(DISTANCE_RELATIVE, na.rm = TRUE),
SE = round(sqrt(stats::var(DISTANCE_RELATIVE, na.rm = TRUE)/length(stats::na.omit(DISTANCE_RELATIVE))), 2),
MIN = round(min(DISTANCE_RELATIVE, na.rm = TRUE), 2),
MAX = round(max(DISTANCE_RELATIVE, na.rm = TRUE), 2),
QUANTILE25 = stats::quantile(DISTANCE_RELATIVE, 0.25), # quantile25
QUANTILE75 = stats::quantile(DISTANCE_RELATIVE, 0.75)#, # quantile75
# OUTLIERS_LOW = QUANTILE25 - (1.5 * (QUANTILE75 - QUANTILE25)), # outliers : below the outlier boxplot
# OUTLIERS_HIGH = QUANTILE75 + (1.5 * (QUANTILE75 - QUANTILE25)) # outliers : higher the outlier boxplot
) %>%
readr::write_tsv(
x = .,
file = file.path(path.folder, "individuals.pairwise.distance.stats.tsv"),
col_names = TRUE
)
message("Generating plots")
# violin plot
res$violin.plot.distance <- ggplot2::ggplot(
data = res$distance,
ggplot2::aes(x = PAIRWISE, y = DISTANCE_RELATIVE, na.rm = TRUE)
) +
ggplot2::geom_violin(trim = TRUE) +
ggplot2::geom_boxplot(width = 0.1, fill = "black", outlier.colour = "black") +
ggplot2::stat_summary(fun = "mean", geom = "point", shape = 21, size = 2.5, fill = "white") +
ggplot2::labs(y = "Distance (relative)\n <- distant close->") +
ggplot2::labs(x = "Pairwise comparisons") +
ggplot2::scale_y_reverse() +
ggplot2::theme(
# legend.position = "none",
panel.grid.minor.x = ggplot2::element_blank(),
panel.grid.major.x = ggplot2::element_blank(),
# panel.grid.major.y = element_blank(),
axis.title.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
axis.text.x = ggplot2::element_blank(),
axis.title.y = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
axis.text.y = ggplot2::element_text(size = 8, family = "Helvetica")
)
ggplot2::ggsave(
filename = file.path(path.folder, "violin.plot.distance.pdf"),
plot = res$violin.plot.distance,
width = 20,
height = 15,
dpi = 200,
units = "cm",
useDingbats = FALSE
)
ggplot2::ggsave(
filename = file.path(path.folder, "violin.plot.distance.png"),
plot = res$violin.plot.distance,
width = 20,
height = 15,
dpi = 200,
units = "cm"
)
# Manhattan plot
res$manhattan.plot.distance <- ggplot2::ggplot(
data = res$distance,
ggplot2::aes(x = PAIRWISE, y = DISTANCE_RELATIVE, colour = POP_COMP)
) +
ggplot2::geom_jitter(alpha = 0.3) +
ggplot2::labs(y = "Distance (relative)\n <- distant close->") +
ggplot2::labs(x = "Pairwise comparisons") +
ggplot2::labs(colour = "Population comparisons") +
ggplot2::scale_colour_manual(values = c("#0571b0", "black")) +
ggplot2::scale_y_reverse() +
ggplot2::theme(
# legend.position = "none",
panel.grid.minor.x = ggplot2::element_blank(),
panel.grid.major.x = ggplot2::element_blank(),
# panel.grid.major.y = element_blank(),
axis.title.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
axis.text.x = ggplot2::element_blank(),
axis.title.y = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
axis.text.y = ggplot2::element_text(size = 8, family = "Helvetica")
) +
ggplot2::theme_light()
ggplot2::ggsave(
filename = file.path(path.folder, "manhattan.plot.distance.pdf"),
plot = res$manhattan.plot.distance,
width = 20,
height = 15,
dpi = 200,
units = "cm",
useDingbats = FALSE
)
ggplot2::ggsave(
filename = file.path(path.folder, "manhattan.plot.distance.png"),
plot = res$manhattan.plot.distance,
width = 20,
height = 15,
dpi = 200,
units = "cm"
)
} # end distance method
# Compute genome similarity -------------------------------------------------
if (genome) {
# If GT_BIN available, we need a new input.prep (not the same as dist method)
if (gt.field == "GT_BIN") {
input.prep <- dplyr::filter(.data = data, !is.na(GT_BIN))
}
# all combination of individual pair
all.pairs <- utils::combn(unique(input.prep$INDIVIDUALS), 2, simplify = TRUE) %>%
t %>%
tibble::as_tibble(.) %>%
magrittr::set_colnames(x = ., value = c("ID1", "ID2")) %>%
dplyr::mutate(
PAIRS = seq(from = 1, to = n(), by = 1),
MARKERS_TOTAL = rep(n.markers, n()) # just n.markers works
)
# get the number of pairwise comp.
number.pairwise <- nrow(all.pairs)
message("Starting scan for duplicates")
message("Pairwise comparisons: ", number.pairwise)
if (!is.null(threshold.common.markers)) {
message("Threshold to keep a pair: ", round(threshold.common.markers, 2))
}
keep.data <- TRUE
if (number.pairwise > 100000) {
if (verbose) message(" Time for coffee...")
keep.data <- FALSE
}
# generate the file to print into
pairwise.filename <- file.path(path.folder, "individuals.pairwise.genome.similarity.tsv")
pairwise.genome.similarity <- tibble::tibble(
PAIRS = as.numeric(),
ID1 = as.character(),
ID2 = as.character(),
ID1_STRATA = as.character(),
ID2_STRATA = as.character(),
POP_COMP = as.character(),
MARKERS_TOTAL = as.integer(),
MARKERS_COMMON = as.integer(),
MARKERS_COMMON_PROP = as.numeric(),
MARKERS_MISSING = as.integer(),
IDENTICAL = as.integer(),
DIFFERENT = as.integer(),
IDENTICAL_PROP = as.numeric(),
IDENTICAL_PROP_TOTAL = as.numeric(),
GENOTYPED_MIN = as.character(),
GENOTYPED_MAX = as.character(),
PAIRWISE = as.character(),
METHOD = as.character()
) %>%
readr::write_tsv(
x = .,
file = pairwise.filename,
col_names = TRUE)
blacklist.pairs.filename <- file.path(path.folder, "blacklist.pairs.threshold.tsv")
blacklist.pairs <- tibble::tibble(
PAIRS = as.numeric(),
ID1 = as.character(),
ID2 = as.character()) %>%
readr::write_tsv(
x = .,
file = blacklist.pairs.filename,
col_names = TRUE)
# number.pairwise <- 10
# number.pairwise <- 1000
# number.pairwise <- 2000
# number.pairwise <- 5000
# number.pairwise <- 20000
# parallel.core <- 10
# Optimizing cpu usage
round.cpu <- 1
if (number.pairwise > 200 * parallel.core) {
round.cpu <- floor(number.pairwise / (200 * parallel.core))
}
res$pairwise.genome.similarity <- radiator_future(
.x = all.pairs,
.f = genome_similarity,
flat.future = "dfr",
split.vec = TRUE,
split.with = NULL,
split.chunks = parallel.core * round.cpu,
parallel.core = parallel.core,
data = input.prep,
threshold.common.markers = threshold.common.markers,
keep.data = keep.data,
pairwise.filename = pairwise.filename,
blacklist.pairs.filename = blacklist.pairs.filename
)
# as.integer is usually twice as light as numeric vector...
# all.pairs$SPLIT_VEC <- as.integer(floor((parallel.core * round.cpu * (seq_len(number.pairwise) - 1) / number.pairwise) + 1))
# res$pairwise.genome.similarity <- radiator_parallel(
# X = unique(all.pairs$SPLIT_VEC),
# FUN = genome_similarity,
# mc.preschedule = FALSE,
# mc.silent = FALSE,
# mc.cleanup = TRUE,
# mc.cores = parallel.core,
# all.pairs = all.pairs,
# data = input.prep,
# threshold.common.markers = threshold.common.markers,
# keep.data = keep.data,
# pairwise.filename = pairwise.filename,
# blacklist.pairs.filename = blacklist.pairs.filename
# ) %>%
# dplyr::bind_rows(.)
input.prep <- id.pairwise <- NULL # no longer needed
if (verbose) message("Generating summary statistics")
if (nrow(res$pairwise.genome.similarity) == 0) {
data.import <- readr::read_tsv(file = pairwise.filename, col_types = "d____c___i__d___c_")
new.pairwise <- nrow(data.import)
res$genome.stats <- data.import %>%
dplyr::summarise(
MEAN = mean(IDENTICAL_PROP, na.rm = TRUE),
MEDIAN = stats::median(IDENTICAL_PROP, na.rm = TRUE),
SE = round(sqrt(stats::var(IDENTICAL_PROP, na.rm = TRUE)/length(stats::na.omit(IDENTICAL_PROP))), 2),
MIN = round(min(IDENTICAL_PROP, na.rm = TRUE), 2),
MAX = round(max(IDENTICAL_PROP, na.rm = TRUE), 2),
QUANTILE25 = stats::quantile(IDENTICAL_PROP, 0.25), # quantile25
QUANTILE75 = stats::quantile(IDENTICAL_PROP, 0.75)# quantile75
)
} else {
new.pairwise <- nrow(res$pairwise.genome.similarity)
# Stats
res$genome.stats <- res$pairwise.genome.similarity%>%
dplyr::summarise(
MEAN = mean(IDENTICAL_PROP, na.rm = TRUE),
MEDIAN = stats::median(IDENTICAL_PROP, na.rm = TRUE),
SE = round(sqrt(stats::var(IDENTICAL_PROP, na.rm = TRUE)/length(stats::na.omit(IDENTICAL_PROP))), 2),
MIN = round(min(IDENTICAL_PROP, na.rm = TRUE), 2),
MAX = round(max(IDENTICAL_PROP, na.rm = TRUE), 2),
QUANTILE25 = stats::quantile(IDENTICAL_PROP, 0.25), # quantile25
QUANTILE75 = stats::quantile(IDENTICAL_PROP, 0.75)# quantile75
)
}
discarded.pairs <- number.pairwise - new.pairwise
if (discarded.pairs > 0) {
message("Number of discarded pairs based on threshold: ", round(discarded.pairs, 2))
message(" more details in: blacklist.pairs.threshold.tsv\n")
} else {
file.remove(blacklist.pairs.filename)
}
readr::write_tsv(
x = res$genome.stats,
file = file.path(path.folder, "individuals.pairwise.genome.stats.tsv"),
col_names = TRUE
)
# Visualization ------------------------------------------------------------
message("Generating the plots")
if (nrow(res$pairwise.genome.similarity) == 0) {
res$violin.plot.genome <- ggplot2::ggplot(
data = data.import,
ggplot2::aes(x = PAIRWISE, y = IDENTICAL_PROP, na.rm = TRUE))
res$manhattan.plot.genome <- ggplot2::ggplot(
data = data.import,
ggplot2::aes(x = PAIRWISE, y = IDENTICAL_PROP, colour = POP_COMP,
size = MARKERS_MISSING))
} else {
res$violin.plot.genome <- ggplot2::ggplot(
data = res$pairwise.genome.similarity,
ggplot2::aes(x = PAIRWISE, y = IDENTICAL_PROP, na.rm = TRUE))
res$manhattan.plot.genome <- ggplot2::ggplot(
data = res$pairwise.genome.similarity,
ggplot2::aes(x = PAIRWISE, y = IDENTICAL_PROP, colour = POP_COMP,
size = MARKERS_MISSING))
}
# violin plot
res$violin.plot.genome <- res$violin.plot.genome +
ggplot2::geom_violin(trim = TRUE) +
ggplot2::geom_boxplot(width = 0.1, fill = "black", outlier.colour = "black") +
ggplot2::stat_summary(fun = "mean", geom = "point", shape = 21, size = 2.5, fill = "white") +
ggplot2::labs(y = "Genome similarity (proportion)") +
ggplot2::labs(x = "Pairwise comparison") +
ggplot2::theme(
# legend.position = "none",
panel.grid.minor.x = ggplot2::element_blank(),
panel.grid.major.x = ggplot2::element_blank(),
# panel.grid.major.y = element_blank(),
axis.title.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
axis.text.x = ggplot2::element_blank(),
axis.title.y = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
axis.text.y = ggplot2::element_text(size = 8, family = "Helvetica")
)
ggplot2::ggsave(
filename = file.path(path.folder, "violin.plot.genome.pdf"),
plot = res$violin.plot.genome,
width = 20, height = 15, dpi = 150, units = "cm", useDingbats = FALSE)
# Manhattan plot
res$manhattan.plot.genome <- res$manhattan.plot.genome +
ggplot2::geom_jitter(alpha = 0.3) +
ggplot2::labs(y = "Genome similarity (proportion)") +
ggplot2::labs(x = "Pairwise comparisons") +
ggplot2::labs(colour = "Population comparisons") +
ggplot2::scale_colour_manual(values = c("#0571b0", "black")) +
ggplot2::scale_size_area(name = "Markers missing", max_size = 6) +
ggplot2::theme(
# legend.position = "none",
panel.grid.minor.x = ggplot2::element_blank(),
panel.grid.major.x = ggplot2::element_blank(),
# panel.grid.major.y = element_blank(),
axis.title.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
axis.text.x = ggplot2::element_blank(),
axis.title.y = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
axis.text.y = ggplot2::element_text(size = 8, family = "Helvetica")
) +
ggplot2::theme_light()
ggplot2::ggsave(
filename = file.path(path.folder, "manhattan.plot.genome.pdf"),
plot = res$manhattan.plot.genome,
width = 20, height = 15, dpi = 150, units = "cm", useDingbats = FALSE)
ggplot2::ggsave(
filename = file.path(path.folder, "manhattan.plot.genome.png"),
plot = res$manhattan.plot.genome,
width = 20, height = 15, dpi = 200, units = "cm")
} # end genome method
# Removing duplicates ------------------------------------------------------
if (blacklist.duplicates || interactive.filter) {
check.mono <- FALSE
message("\nInspect tables and figures to decide if some individual(s) need to be blacklisted")
remove.id <- radiator_question(
x = " Do you need to blacklist individual(s) (y/n): ", answer.opt = c("y", "n"))
if (remove.id == "y") {
message("\n2 options to remove duplicates:")
message(" 1. threshold: using the figure you choose a threshold. It's more powerful to fully remove duplicates")
message(" 2. manually: the function generate a blacklist that you have to complete")
message(" Note: not sure ? Use option 1, it's more powerful to fully remove duplicates")
remove.dup <- radiator_question(
x = " Enter the option to remove duplicates (1/2): ", minmax = c(1,2))
if (remove.dup == 2) {
readr::write_tsv(
x = tibble::tibble(INDIVIDUALS = as.character()),
file = file.path(path.folder, "blacklist.id.similar.tsv"),
append = FALSE, col_names = TRUE)
message(" An empty blacklist file was generated: blacklist.id.similar.tsv")
message(" Keep column name, just add the individual(s) to blacklist(s)")
blacklist.id.similar <- "check blacklist.id.similar.tsv file"
finished <- radiator_question(
x = " When finished filling the blacklist type `y`:",
answer.opt = c("y", "Y", "yes", "Yes", "YES"))
blacklist.id.similar <- readr::read_tsv(
file = file.path(path.folder, "blacklist.id.similar.tsv"),
col_types = "c")
dup.threshold <- "blacklist generated manually"
} else {# with threshold
# message("Use the distance or genome analysis to blacklist duplicates ? (distance/genome): ")
# analysis <- as.character(readLines(n = 1))
if (data.type == "SeqVarGDSClass") {
analysis <- "distance"
} else {
if (!is.null(distance.method) && genome) {
analysis <- radiator_question(
x = "\nChoose the analysis method to blacklist duplicates? (distance/genome): ",
answer.opt = c("distance", "genome")
)
} else if (genome) {
analysis <- "genome"
} else {
analysis <- "distance"
}
}
if (analysis == "distance") {
data.dup <- "individuals.pairwise.dist.tsv"
} else {
data.dup <- "individuals.pairwise.genome.similarity.tsv"
}
dup.threshold <- radiator_question(
x = "\nEnter the threshold to remove duplicates: (between 0 and 1)", minmax = c(0, 1))
message("\n2 options to remove duplicates involved in pairs from different strata/group:")
message(" (the black points on the figure, above your threshold)")
message(" 1: blacklist both samples in the pair")
message(" 2: blacklist only 1 sample, based on missingness")
diff.pop.remove <- radiator_question(
x = " Enter 1/2: ", minmax = c(1, 2))
if (diff.pop.remove == 2) {
diff.pop.remove <- FALSE
} else {
diff.pop.remove <- TRUE
}
blacklist.id.similar <- remove_duplicates(
data = data.dup,
stats = "genotyped.statistics.tsv",
dup.threshold = dup.threshold,
diff.pop.remove = diff.pop.remove,
path.folder = path.folder) %$% blacklist.id
}
n.ind.blacklisted <- nrow(blacklist.id.similar)
} else {
dup.threshold <- 0L
n.ind.blacklisted <- 0L
}
if (n.ind.blacklisted > 0) {
check.mono <- TRUE
if (verbose) message("Blacklisted individuals: ", n.ind.blacklisted, " ind.")
if (verbose) message(" Filtering with blacklist of individuals")
if (data.type == "tbl_df") {
data %<>% dplyr::filter(!INDIVIDUALS %in% blacklist.id.similar$INDIVIDUALS)
} else {
id.info <- extract_individuals_metadata(gds = data, whitelist = FALSE) %>%
dplyr::mutate(
FILTERS = dplyr::if_else(
INDIVIDUALS %in% blacklist.id.similar$INDIVIDUALS,
"filter.individuals.duplicates", FILTERS
)
)
update_radiator_gds(gds = data, node.name = "individuals.meta", value = id.info, sync = TRUE)
# id.info <- extract_individuals_metadata(gds = data)
# bl <- id.info %>% dplyr::filter(INDIVIDUALS %in% blacklist.id.similar$INDIVIDUALS) %>%
# dplyr::distinct(INDIVIDUALS) %>%
# dplyr::mutate(FILTER = "filter.individuals.duplicates")
# bl.i <- update_bl_individuals(gds = data, update = bl)
# id.info %<>% dplyr::filter(!INDIVIDUALS %in% blacklist.id.similar$INDIVIDUALS)
}
blacklist.id.similar <- NULL
}
} else {
dup.threshold <- 0L
n.ind.blacklisted <- 0L
check.mono <- FALSE
}# End blacklist.duplicates
# updating parameters --------------------------------------------------
filters.parameters <- radiator_parameters(
generate = FALSE,
initiate = FALSE,
update = TRUE,
parameter.obj = filters.parameters,
data = data,
filter.name = "detect duplicate genomes",
param.name = "dup.threshold",
values = dup.threshold,
path.folder = path.folder,
file.date = file.date,
internal = internal,
verbose = verbose)
# results ------------------------------------------------------------------
radiator_results_message(
rad.message = stringi::stri_join("Detect duplicate genomes: ", dup.threshold),
filters.parameters,
internal,
verbose
)
# MONOMORPHIC MARKERS --------------------------------------------------
if (check.mono) {
data <- filter_monomorphic(
data = data,
parallel.core = parallel.core,
verbose = FALSE,
parameters = filters.parameters,
path.folder = path.folder,
internal = FALSE)
}
}
return(data)
}# end function detect_duplicate_genomes
# Internal nested functions: ---------------------------------------------------
# distance method --------------------------------------------------------------
#' @title Distance individuals
#' @description distance method
#' @rdname distance_individuals
#' @export
#' @keywords internal
distance_individuals <- function(
x,
strata = NULL,
distance.method = "manhattan",
parallel.core = parallel::detectCores() - 1
) {
data.type <- radiator::detect_genomic_format(x)
if (data.type == "tbl_df") {
x <- dplyr::select(x, -MARKERS)
n.ind <- dplyr::n_distinct(x$INDIVIDUALS)
x <- suppressWarnings(
dplyr::ungroup(x) %>%
radiator::rad_wide(
x = .,
formula = "INDIVIDUALS ~ MARKERS_ALLELES",
values_from = "n"
) %>%
tibble::remove_rownames(.data = .) %>%
tibble::column_to_rownames(.data = ., var = "INDIVIDUALS"))
x <- suppressWarnings(
amap::Dist(
x = x,
method = distance.method,
nbproc = parallel.core)) %>%
distance2tibble(.)# melt the dist matrice into a data frame
} else {
sample.id <- extract_individuals_metadata(
gds = x,
ind.field.select = "INDIVIDUALS",
whitelist = TRUE) %$%
INDIVIDUALS
# x.bk <- x
x <- SNPRelate::snpgdsIBS(
gdsobj = x,
autosome.only = FALSE,
num.thread = parallel.core,
remove.monosnp = TRUE,
snp.id = extract_markers_metadata(
gds = x, markers.meta.select = "VARIANT_ID", whitelist = TRUE) %$% VARIANT_ID,
sample.id = sample.id,
verbose = FALSE) %$% ibs
# summary_gds(gds = x.bk)
x <- 1 - x
# magrittr::subtract(1) %>%
x %<>%
magrittr::set_colnames(sample.id) %>%
magrittr::set_rownames(sample.id) %>%
distance2tibble(.)
sample.id <- NULL
distance.method <- "IBS"
}
# Include population info with strata
ID1.pop <- suppressWarnings(
dplyr::select(.data = x, INDIVIDUALS = ID1) %>%
dplyr::inner_join(strata, by = "INDIVIDUALS") %>%
dplyr::select(ID1_POP = STRATA))
ID2.pop <- suppressWarnings(
dplyr::select(.data = x, INDIVIDUALS = ID2) %>%
dplyr::inner_join(strata, by = "INDIVIDUALS") %>%
dplyr::select(ID2_POP = STRATA))
x <- dplyr::bind_cols(x, ID1.pop, ID2.pop) %>%
dplyr::mutate(
POP_COMP = dplyr::if_else(ID1_POP == ID2_POP, "same pop", "different pop"),
POP_COMP = factor(POP_COMP, levels = c("same pop", "different pop"), ordered = TRUE),
PAIRWISE = rep("pairwise", n()),
METHOD = rep(distance.method, n())
)
ID1.pop <- ID2.pop <- NULL
return(x)
}#End distance_individuals
# pairwise genome similarity method---------------------------------------------
#' @title genome_similarity
#' @description for the parallel part
#' @rdname genome_similarity
#' @export
#' @keywords internal
genome_similarity <- carrier::crate(function(
all.pairs,
data = NULL,
threshold.common.markers = NULL,
keep.data = TRUE,
pairwise.filename = NULL,
blacklist.pairs.filename = NULL
) {
`%>%` <- magrittr::`%>%`
`%<>%` <- magrittr::`%<>%`
# split.vec <- 20 # test
# all.pairs %<>%
# dplyr::filter(SPLIT_VEC == split.vec) %>%
# dplyr::select(-SPLIT_VEC)
genome_similarity_map <- function(
pair,
some.pairs = NULL,
data = NULL,
threshold.common.markers = NULL
) {
`%>%` <- magrittr::`%>%`
`%<>%` <- magrittr::`%<>%`
# pair <- 1#test
res <- list()
res$genome.comparison <- dplyr::filter(some.pairs, PAIRS == pair) %>%
dplyr::mutate(
ID1_STRATA = unique(data$STRATA[data$INDIVIDUALS == ID1]),
ID2_STRATA = unique(data$STRATA[data$INDIVIDUALS == ID2])
)
if (rlang::has_name(data, "GT_BIN")) {
# genotypes & markers
id1.data <- dplyr::filter(.data = data, INDIVIDUALS %in% res$genome.comparison$ID1) %>%
dplyr::select(-INDIVIDUALS, -STRATA) %>% dplyr::arrange(MARKERS)
id2.data <- dplyr::filter(.data = data, INDIVIDUALS %in% res$genome.comparison$ID2) %>%
dplyr::select(-INDIVIDUALS, -STRATA) %>% dplyr::arrange(MARKERS)
id1.n.geno <- nrow(id1.data)
id2.n.geno <- nrow(id2.data)
n.markers <- dplyr::n_distinct(data$MARKERS)
data <- NULL
markers.common <- length(dplyr::intersect(x = id1.data$MARKERS, y = id2.data$MARKERS))
common.prop <- markers.common/n.markers
keep.comp <- TRUE
# threshold.common.markers <- 0.3#test
if (!is.null(threshold.common.markers)) {
keep.comp <- common.prop >= threshold.common.markers
}
if (keep.comp) {
res$genome.comparison <- res$genome.comparison %>%
dplyr::mutate(
POP_COMP = dplyr::if_else(ID1_STRATA == ID2_STRATA, "same pop", "different pop"),
MARKERS_TOTAL = n.markers,
MARKERS_COMMON = markers.common,
MARKERS_COMMON_PROP = common.prop,
MARKERS_MISSING = MARKERS_TOTAL - MARKERS_COMMON,
IDENTICAL = nrow(dplyr::intersect(x = id1.data, y = id2.data)),
DIFFERENT = MARKERS_COMMON - IDENTICAL,
IDENTICAL_PROP = IDENTICAL / MARKERS_COMMON,
IDENTICAL_PROP_TOTAL = IDENTICAL / MARKERS_TOTAL,
GENOTYPED_MIN = dplyr::if_else(id1.n.geno < id2.n.geno, ID1, ID2),
GENOTYPED_MAX = dplyr::if_else(id1.n.geno > id2.n.geno, ID1, ID2),
PAIRWISE = "pairwise comparison",
METHOD = "genome similarity") %>%
dplyr::select(PAIRS, ID1, ID2, ID1_STRATA, ID2_STRATA,
POP_COMP, MARKERS_TOTAL, MARKERS_COMMON,
MARKERS_COMMON_PROP, MARKERS_MISSING, IDENTICAL,
DIFFERENT, IDENTICAL_PROP, IDENTICAL_PROP_TOTAL, GENOTYPED_MIN,
GENOTYPED_MAX, PAIRWISE, METHOD)
res$blacklist.pairs.threshold <- NULL
# res$blacklist.pairs.threshold <- tibble::tibble(
# PAIRS = as.numeric(),
# ID1 = as.character(),
# ID2 = as.character())
} else {
message("blacklisted pairs")
res$blacklist.pairs.threshold <- dplyr::select(res$genome.comparison, PAIRS, ID1, ID2)
res$genome.comparison <- NULL
}
} else {#using GT
data %<>% dplyr::filter(INDIVIDUALS %in% list.pair & n != 0)
# genotypes & markers
id1.data <- dplyr::filter(.data = data, INDIVIDUALS %in% id1) %>%
dplyr::select(-INDIVIDUALS)
id2.data <- dplyr::filter(.data = data, INDIVIDUALS %in% id2) %>%
dplyr::select(-INDIVIDUALS)
# markers
id1.markers <- dplyr::distinct(id1.data, MARKERS)
id2.markers <- dplyr::distinct(id2.data, MARKERS)
# Check the number of markers in common
# number.common.markers <- nrow(dplyr::intersect(x = id1.markers, y = id2.markers))
# if (is.null(common.markers.threshold)) common.markers.threshold <- number.common.markers
# output comparison
genome.comparison <- tibble::tibble(
ID1 = id1,
ID2 = id2,
MARKERS_COMMON = nrow(dplyr::intersect(x = id1.markers, y = id2.markers)),
IDENTICAL = nrow(dplyr::intersect(x = id1.data, y = id2.data) %>% dplyr::distinct(MARKERS)),
DIFFERENT = MARKERS_COMMON - IDENTICAL,
PROP_IDENTICAL = IDENTICAL / MARKERS_COMMON
)
}
#unused objets:
list.pair <- id1 <- id2 <- data <- id1.data <- id2.data <- NULL
id1.n.geno <- id2.n.geno <- keep.comp <- markers.common <- common.prop <- NULL
return(res)
}#End genome_similarity_map
genome.comparison <- purrr::map(
.x = all.pairs$PAIRS,
.f = genome_similarity_map,
some.pairs = all.pairs,
data = data,
threshold.common.markers = threshold.common.markers)
all.pairs <- NULL
blacklist.pairs.threshold <- genome.comparison %>%
purrr::map_df("blacklist.pairs.threshold")
if (nrow(blacklist.pairs.threshold) > 0) {
readr::write_tsv(
x = blacklist.pairs.threshold,
file = blacklist.pairs.filename,
append = TRUE, col_names = FALSE)
}
genome.comparison %<>%
purrr::map_df("genome.comparison") %>%
readr::write_tsv( x = ., file = pairwise.filename, append = TRUE, col_names = FALSE)
if (!keep.data) genome.comparison <- NULL
blacklist.pairs.threshold <- NULL
return(genome.comparison)
})#End genome_similarity
# calculate allele count in parallel -------------------------------------------
#' @title allele_count
#' @description to calculate allele count in parallel
#' @rdname allele_count
#' @export
#' @keywords internal
allele_count <- carrier::crate(function(x) {
`%>%` <- magrittr::`%>%`
`%<>%` <- magrittr::`%<>%`
x %<>%
dplyr::ungroup(.) %>%
dplyr::select(MARKERS, INDIVIDUALS, GT) %>%
dplyr::filter(GT != "000000") %>%
dplyr::mutate(
A1 = stringi::stri_sub(str = GT, from = 1, to = 3),
A2 = stringi::stri_sub(str = GT, from = 4, to = 6)
) %>%
dplyr::select(-GT) %>%
radiator::rad_long(
x = .,
cols = c("INDIVIDUALS", "MARKERS"),
names_to = "ALLELES",
values_to = "GT"
) %>%
dplyr::arrange(MARKERS, INDIVIDUALS, GT) %>%
dplyr::count(x = ., INDIVIDUALS, MARKERS, GT) %>%
dplyr::ungroup(.) %>%
tidyr::complete(
data = ., INDIVIDUALS, tidyr::nesting(MARKERS, GT), fill = list(n = 0))
return(x)
})#End allele_count
# remove_duplicates -----------------------------------------------------------
#' @name remove_duplicates
#' @title Read tidy genomic data file ending .rad
#' @description Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' To remove duplicate individuals based on threshold established from the
#' visualization figures.
#' @param data (path) The individual's pairwise data.
#' Default: \code{data = "individuals.pairwise.dist.tsv"}.
#' @param stats (path) The genotype statistics
#' Default: \code{stats = "genotyped.statistics.tsv"}.
#' @param dup.threshold (double) The threshold to filter out duplicates
#' Default: \code{dup.threshold = 0.25}.
#' @param diff.pop.remove Remove all individuals in pairs from different pop.
#' Both samples are potentially problems. With defautl, the function will not keep
#' one sample in the duplicate pair.
#' Default: \code{diff.pop.remove = TRUE}.
#' @return A list with blacklisted duplicates. Write the blacklist in the working
#' directory.
#' @export
#' @keywords internal
#' @rdname remove_duplicates
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
remove_duplicates <- function(
data = "individuals.pairwise.dist.tsv",
stats = "genotyped.statistics.tsv",
dup.threshold = 0.25,
diff.pop.remove = TRUE,
path.folder = NULL
) {
if (is.null(path.folder)) path.folder <- getwd()
data <- file.path(path.folder, data)
stats <- file.path(path.folder, stats)
dup.filtered <- suppressWarnings(suppressMessages(readr::read_tsv(data))) %>%
dplyr::mutate(ID1 = as.character(ID1), ID2 = as.character(ID2))
if (rlang::has_name(dup.filtered, "DISTANCE_RELATIVE")) {
dup.filtered <- dup.filtered %>%
dplyr::filter(DISTANCE_RELATIVE < dup.threshold)
} else {
dup.filtered <- dup.filtered %>%
dplyr::filter(PROP_IDENTICAL > dup.threshold)
}
if (nrow(dup.filtered) > 0) {
dup.list.names <- tibble::tibble(INDIVIDUALS = c(dup.filtered$ID1, dup.filtered$ID2)) %>%
dplyr::group_by(INDIVIDUALS) %>%
dplyr::tally(.) %>%
dplyr::ungroup(.) %>%
dplyr::arrange(dplyr::desc(n)) %>%
dplyr::distinct(INDIVIDUALS) %>%
purrr::flatten_chr(.)
geno.stats <- readr::read_tsv(stats, col_types = "cd") %>%
dplyr::filter(INDIVIDUALS %in% dup.list.names)
res <- list(blacklist.id = tibble::tibble(INDIVIDUALS = character(0)),
whitelist.id = tibble::tibble(INDIVIDUALS = character(0)))
for (i in dup.list.names) {
# i <- dup.list.names[1]
dups <- dplyr::filter(dup.filtered, ID1 %in% i | ID2 %in% i)
dups <- sort(unique(c(dups$ID1, dups$ID2)))
# find all duplicates associated with the network
new.dups <- 0L
while(length(new.dups) > 0) {
new.dups <- dplyr::filter(dup.filtered, ID1 %in% dups | ID2 %in% dups)
new.dups <- sort(unique(c(new.dups$ID1, new.dups$ID2)))
new.dups <- purrr::keep(.x = new.dups, .p = !new.dups %in% dups)
if (length(new.dups) > 0) {
dups <- c(dups, new.dups)
}
}
dups <- tibble::tibble(INDIVIDUALS = dups)
if (nrow(res$blacklist.id) > 0) {
dups <- dplyr::filter(dups, !INDIVIDUALS %in% res$blacklist.id$INDIVIDUALS)
}
if (nrow(dups) > 0) {
diff.pop <- dup.filtered %>%
dplyr::filter(ID1 %in% dups$INDIVIDUALS | ID2 %in% dups$INDIVIDUALS) %>%
dplyr::distinct(POP_COMP) %>%
dplyr::filter(POP_COMP == "different pop")
if (diff.pop.remove && (nrow(diff.pop) > 0)) {
blacklist.diff.pop <- dup.filtered %>%
dplyr::filter(ID1 %in% dups$INDIVIDUALS | ID2 %in% dups$INDIVIDUALS) %>%
dplyr::distinct(POP_COMP) %>%
dplyr::filter(POP_COMP == "different pop")
if (nrow(blacklist.diff.pop) > 0) {
res$blacklist.id <- dplyr::bind_rows(res$blacklist.id, dups)
}
blacklist.diff.pop <- NULL
} else {
whitelist.id <- dups %>%
dplyr::left_join(geno.stats, by = "INDIVIDUALS") %>%
dplyr::filter(GENOTYPED_PROP == max(GENOTYPED_PROP)) %>%
dplyr::sample_n(tbl = ., size = 1) %>% # make sure only 1 is selected
dplyr::select(INDIVIDUALS)
if (nrow(whitelist.id) > 0) res$whitelist.id <- dplyr::bind_rows(res$whitelist.id, whitelist.id)
blacklist.id <- dplyr::filter(dups, !INDIVIDUALS %in% whitelist.id$INDIVIDUALS) %>%
dplyr::select(INDIVIDUALS)
if (nrow(blacklist.id) > 0) res$blacklist.id <- dplyr::bind_rows(res$blacklist.id, blacklist.id)
}
diff.pop <- NULL
}
}
dups <- blacklist.id <- whitelist.id <- i <- new.dups <- NULL
res$blacklist.id <- dplyr::distinct(res$blacklist.id, INDIVIDUALS)
res$whitelist.id <- dplyr::distinct(res$whitelist.id, INDIVIDUALS)
message("With threshold selected, ", nrow(res$blacklist.id) ," individual(s) blacklisted")
readr::write_tsv(x = res$blacklist.id, file = file.path(path.folder, "blacklist.id.similar.tsv"))
message("Written in the directory: blacklist.id.similar.tsv")
} else {
message("With threshold selected, the blacklist of duplicate individuals is empty")
res <- list(blacklist.id = tibble::tibble(INDIVIDUALS = character(0)),
whitelist.id = tibble::tibble(INDIVIDUALS = character(0)))
}
return(res)
} # End remove_duplicates
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.