R/filter_rad.R

Defines functions filter_rad

Documented in filter_rad

# Import, filter and transform a dart output file to different formats

#' @name filter_rad

#' @title ONE FUNCTION TO RULE THEM ALL

#' @description Designed for RADseq data, it's radiator integrated pipeline that links
#' several \code{filter_} functions of \pkg{radiator}.
#' Rapidly get an idea of what you can and cannot do with your dataset.
#' Novices, start with this one!

#' @inheritParams genomic_converter
#' @inheritParams tidy_genomic_data
#' @inheritParams radiator_common_arguments
#' @inheritParams detect_mixed_genomes
#' @inheritParams filter_hwe
#' @inheritParams read_strata

# @param filter.markers.coverage (optional, string, numerical) Filter the lower and
# upper bound of locus/read coverage. The locus/read coverage combines the markers
# average count for REF and ALT allele (respectively the \code{AvgCountRef} and
# \code{AvgCountSnp} info). These markers statistics are generated by DArT.
# If you have count data, use \code{erase.genotypes} argument below instead.
# Default: \code{filter.markers.coverage = NULL}.
# e.g to keep markers with coverage inbetween 7 and 200,
# use : \code{filter.markers.coverage = c(7, 200)}.

# param erase.genotypes (optional, string, numerical) DArT file with count
# data is required for this argument to work. With count data, genotype,
# REF and ALT coverage information is available and is better suited than
# \code{filter.markers.coverage} to remove/erase data based on coverage info.
# This function argument requires 3 values in the string:
#   \enumerate{
#     \item threshold.low.coverage: threshold for the minimum read coverage. Under this threshold,
#     genotypes are erased. e.g. 7
#     \item threshold.gl: threshold that applies only for heterozygous genotypes.
#     Using the \code{threshold.low.coverage} doesn't guarantees that REF and ALT
#     allele have adequate coverage. This threshold does.
#     A genotype likelihood value is generated based
#     on the departure of equal coverage between REF and ALT allele, number of samples
#     sharing the heterozygous genotype for the locus, missing data for the locus
#     and individual. Below the genotype likelihood threshold value,
#     the heterozygous genotypes are erased. e.g. if an heterozygous genotype for a
#     marker as REF/ALT coverage of 100/3 with only 1 sample sharing this info and
#     this sample has 50% missing data and the marker missingness is averaged, the
#     GL value will be extremely low compared to another heterozygous genotype with
#     50/50 of coverage and 10 samples sharing the same genotype...
#     \item threshold.high.coverage: threshold that allows to erase genotypes with
#     very high coverage
#   }
#     e.g. of values: \code{erase.genotypes = c(7, -0.25, 200)}.
#     However, using the \code{interactive.filter = TRUE} is highly recommended to
#     visualize data before choosing values.
#     Default: \code{erase.genotypes = NULL}.

# @param filter.individuals.missing (optional, double) New argument to blacklist
# individuals with too many missing genotypes. Below the threshold, individuals
# are blacklisted. e.g. 0.80 will blacklist individuals with more than 20% missing
# genotypes.
# Default: \code{filter.individuals.missing = NULL}.


# @param filter.markers.missing (optional, string) Similar to call rate, but
# more adapted to the data. 3 values are required in the string, corresponding
# to the \code{\link[radiator]{filter_individual}} module of radiator.
# First value is the approach to count genotyped individuals per markers, \code{"overall"}
# or by \code{"pop"}. Second value is the percent threshold for the marker, with
# \code{70}, 70 percent of genotyped individuals are required to keep the marker.
# The last threshold is the number of problematic population that are allowed to skip
# the threshold. In doubt, use the interactive mode that take step by step these
# arguments. e.g to keep individuals genotyped at >= 70 percent for the markers,
# without considering the population info and allowing 1 population to be problematic for the
# threshold, use: \code{c("overall", 70, 1)}.
# Default: \code{filter.markers.missing = NULL}.


# @param number.snp.reads (optional, integer) This filter removes outlier markers
# with too many SNP number per locus/read.
# Having a higher than "normal" SNP number is usually the results of
# assembly artifacts or bad assembly parameters.
# This filter is population-agnostic. This is best decide after viewing the figures,
# with the interactive mode.
# If the argument is set to \code{number.snp.reads = 2},
# locus with 3 and more SNPs will be blacklisted.
# Default: \code{number.snp.reads = NULL}.

# @param detect.mixed.genomes (optional, logical) Highlight outliers individual's
# observed heterozygosity for a quick
# diagnostic of mixed samples or poor polymorphism discovery due to DNA quality,
# sequencing effort, etc.
# See this function for more info: \code{\link[radiator]{detect_mixed_genomes}}.
# Default: \code{detect.mixed.genomes = TRUE}.


# @param duplicate.genomes.analysis (optional, string) Detect duplicate individuals.
# The function can compute two methods (distance or genome pairwise similarity)
# to highligh potential duplicate individuals.
# See this function for more info: \code{\link[radiator]{detect_duplicate_genomes}}.
# The string required to run the analysis as 2 values:
# \enumerate{
# \item TRUE/FALSE to run the analysis;
# \item Computes pairwise genome similarity (TRUE/FALSE),
# with FALSE just the distance measure is used.
# The pairwise genome similarity is longer to run, but is better because it
# integrates markers in common/missing data.
# Using \code{interactive.filter = TRUE}, can overide this value,
# you can opt in for the pairwise genome similarity after viewing the figures
# used with distance measure... handy!
# }
# Default: \code{detect_duplicate_genomes = c(TRUE, FALSE)}.


#' @param filename (optional) The filename prefix for the objet in the global environment
#' or the working directory. Default: \code{filename = NULL}. A default name will be used,
#' customized with the output file(s) selected.

#' @param ... (optional) To pass further argument for fine-tuning the function.

#' @return The function returns an object (list). The content of the object
#' can be listed with \code{names(object)} and use \code{$} to isolate specific
#' object (see examples). Some output format will write the output file in the
#' working directory. The tidy genomic data frame is generated automatically.

#' @examples
#' \dontrun{
#' # Some of the packages used in this function are not installed by default.
#' # Installed the required packages:
#' radiator_pkg_install()
#'
#' # Very simple:
#' shark <- radiator::filter_rad(
#'     data = "data.shark.vcf",
#'     strata = "strata.shark.tsv")
#'
#'
#' # With filename and output
#' shark <- radiator::filter_rad(
#'     data = "data.shark.vcf",
#'     strata = "strata.shark.tsv",
#'     output = "genind",
#'     filename = "shark")
#' }



#' @section Filtering steps and folders generated:
#' \enumerate{
#' \item \strong{radiator}:
#' \itemize{
#' \item the GDS file ending with \code{.gds.rad}. To open in R use \code{\link{read_rad}}.
#' \item \code{filters_parameters.tsv}: file containing all the parameters and
#' values of the filtering process.
#' \item \code{radiator_filter_rad_args.tsv}: the function calls and values, for
#' reproducibility.
#' \item \code{radiator_tidy_dart_metadata.rad}: for DArT data, this is the markers
#' metadata file. To open in R use \code{\link{read_rad}}.
#' \item \code{random.seed}: for reproducibility.
#' }
#' \item \strong{filter_dart_reproducibility}: filter reproducibility of markers
#' (DArT data only). Described in \code{\link[radiator]{filter_dart_reproducibility}}.
#' \itemize{
#' \item \code{blacklist.dart.reproducibility.tsv}: blacklisted markers.
#' \item \code{whitelist.dart.reproducibility.tsv}: whitelisted markers.
#' \item \code{dart_reproducibility_boxplot.pdf}: the boxplot of reproducibility.
#' \item \code{dart_reproducibility_stats.tsv}: reproducibility associated summary statistics.
#' \item \code{dart.reproducibility.helper.plot.pdf}: the helper plot showing
#' the impact of thresholds
#' on the number of markers blacklisted and whitelisted.
#' \item \code{dart.reproducibility.helper.table.tsv}: a tibble with
#' the impact of thresholds on the the number
#' of markers blacklisted and whitelisted.
#' \item \code{radiator_filter_dart_reproducibility_args.tsv}: the function calls and values.
#' }
#'
#' \item \strong{filter_monomorphic}: removes monomorphic markers.
#' Described in \code{\link[radiator]{filter_monomorphic}}.
#' \itemize{
#' \item \code{blacklist.monomorphic.markers.tsv}: blacklisted markers.
#' \item \code{whitelist.polymorphic.markers}: whitelisted markers.
#' \item \code{radiator_filter_monomorphic_args}: the function calls and values.
#' }
#' \item \strong{filter_common_markers}: keep only markers in common between strata.
#' Described in \code{\link[radiator]{filter_common_markers}}.
#' \itemize{
#' \item \code{common.markers.upsetrplot.pdf}: the UpSetR plot highlight the number
#' of markers in common between strata.
#' \item \code{blacklist.common.markers.tsv}: blacklisted markers.
#' \item \code{whitelist.common.markers.tsv}: whitelisted markers.
#' \item \code{radiator_filter_common_markers_args.tsv}: the function calls and values.
#' }
#'
#' \item \strong{filter_individuals}: blacklist individuals based on missingness
#' and/or heterozygosity and/or total coverage.
#' Described in \code{\link[radiator]{filter_individuals}}.
#' \itemize{
#' \item \code{individuals.qc.pdf}: several boxplots showing different individual
#' quality metrics.
#' \item \code{individuals.qc.stats.tsv}: tibble with individual's
#' proportion of missing genotypes, heterozygosity, total and mean coverage.
#' \item \code{individuals.qc.stats.summary.tsv}: individual's summary statistics.
#' \item \code{blacklist.individuals.missing.tsv}: blacklisted individuals based
#' on missingness (genotyping rate of individuals).
#' \item \code{radiator_filter_individuals_args.tsv}: the function calls and values.
#' }
#' The function will remove automatically monomorphic markers if individuals are removed.
#' \item \strong{filter_ma}: remove/blacklist markers based on Minor/Alternate Allele Count (MAC), Frequency (MAF) or Depth (MAD).
#' Described in \code{\link[radiator]{filter_ma}}.
#' \itemize{
#' \item \code{distribution.mac.global.pdf}: distribution of overall MAC.
#' \item \code{ma.boxplot.pdf}: boxplot of the MAC.
#' \item \code{ma.global.tsv}: a tibble with the global MAC and MAF.
#' \item \code{mac.markers.plot.pdf}: the helper plot showing
#' the impact of thresholds
#' on the number of markers blacklisted and whitelisted.
#' \item \code{mac.helper.table.tsv}: a tibble with
#' the impact of thresholds on the the number
#' of markers blacklisted and whitelisted.
#' \item \code{ma.summary.stats.tsv}: MAC summary statistics.
#' \item \code{blacklist.markers.ma.tsv}: blacklisted markers.
#' \item \code{whitelist.markers.ma.tsv}: whitelisted markers.
#' \item \code{radiator_filter_ma_args.tsv}: the function calls and values.
#' }
#' \item \strong{filter_coverage}: remove/blacklist markers based on mean coverage information.
#' Described in \code{\link[radiator]{filter_coverage}}.
#' \itemize{
#' \item \code{markers_metadata.tsv}: all the markers coverage metadata available.
#' \item \code{markers_metadata_stats.tsv}: summary statistics of coverage information.
#' \item \code{markers_qc.pdf}: several coverage boxplots (total coverage, mean coverage, etc.)
#' \item \code{coverage.low.helper.plot.pdf}: the helper plot showing
#' the impact of thresholds
#' on the number of markers blacklisted and whitelisted.
#' \item \code{coverage.low.helper.table.tsv}: a tibble with
#' the impact of thresholds on the the number
#' of markers blacklisted and whitelisted.
#' \item \code{coverage.high.helper.plot.pdf}: the helper plot showing
#' the impact of thresholds
#' on the number of markers blacklisted and whitelisted.
#' \item \code{coverage.high.helper.table.tsv}: a tibble with
#' the impact of thresholds on the the number
#' of markers blacklisted and whitelisted.
#' \item \code{blacklist.markers.coverage.tsv}: blacklisted markers.
#' \item \code{whitelist.markers.coverage.tsv}: whitelisted markers.
#' \item \code{radiator_filter_coverage_args.tsv}: the function calls and values.
#' }
#' \item \strong{filter_genotyping}: remove/blacklist markers based on genotyping/call rate.
#' Described in \code{\link[radiator]{filter_genotyping}}.
#' \itemize{
#' \item \code{markers_qc.pdf}: the missing genotypes boxplot.
#' \item \code{markers_metadata.tsv}: the missing proportion per markers along other stats.
#' \item \code{markers_metadata_stats.tsv}: the summary statistics of markes genotyping rate..
#' \item \code{markers.genotyping.helper.plot.pdf}: the helper plot showing
#' the impact of thresholds
#' on the number of markers blacklisted and whitelisted.
#' \item \code{genotyping.helper.table.tsv}: a tibble with
#' the impact of thresholds on the the number
#' of markers blacklisted and whitelisted (overall).
#' \item \code{markers.pop.missing.helper.table.tsv}: a tibble with
#' the impact of thresholds on the the number
#' of markers blacklisted and whitelisted (strata).
#' \item \code{blacklist.markers.genotyping.tsv}: blacklisted markers.
#' \item \code{whitelist.markers.genotyping.tsv}: whitelisted markers.
#' \item \code{radiator_filter_genotyping_args.tsv}: the function calls and values.
#' }
#' \item \strong{filter_snp_position_read}: removes markers/SNPs based on their
#' position on the read
#' Described in \code{\link[radiator]{filter_snp_position_read}}.
#' \itemize{
#' \item \code{snp.position.read.boxplot.pdf}: boxplot of SNP position on the read
#' \item \code{snp.position.read.helper.table.tsv}: a tibble with
#' the impact of thresholds on the the number
#' of markers blacklisted and whitelisted.
#' \item \code{snp.position.read.distribution.pdf}: distribution of the SNP position
#' on the read.
#' \item \code{blacklits.markers.snp.position.read.tsv}: blacklisted markers.
#' \item \code{whitelist.markers.snp.position.read.tsv}: whitelisted markers.
#' \item \code{radiator_filter_snp_position_read_args.tsv}: the function calls and values.
#' }
#' \item \strong{filter_snp_number}: removes outlier markers with too many SNP
#' number per locus/read.
#' Described in \code{\link[radiator]{filter_snp_number}}.
#' \itemize{
#' \item \code{markers_metadata.tsv}: metadata associated with the number of SNP/locus
#' \item \code{snp_per_locus.pdf}: boxplot of number of SNP/locus.
#' \item \code{snp_per_locus_distribution.pdf}: distribution on the number od
#' \item \code{snp.per.locus.helper.plot.pdf}: the helper plot showing
#' the impact of thresholds
#' on the number of markers blacklisted and whitelisted.
#' \item \code{blacklist.snp.per.locus.tsv}: blacklisted markers.
#' \item \code{whitelist.snp.per.locus.tsv}: whitelisted markers.
#' \item \code{radiator_filter_snp_number_args.tsv}: the function calls and values.
#' }

#' \item \strong{filter_ld}: SNP short and long distance linkage disequilibrium
#' pruning.
#' Described in \code{\link[radiator]{filter_ld}}.
#' \itemize{
#' \item \code{short.ld.locus.stats.tsv}: number of locus with more than 1, 2, 3 SNPs.
#' \item \code{blacklist.short.ld.tsv}: blacklisted markers.
#' \item \code{whitelist.short.ld.tsv}: whitelisted markers.
#' \item \code{blacklist.long.ld.tsv}: blacklisted markers.
#' \item \code{whitelist.long.ld.tsv}: whitelisted markers.
#' \item \code{radiator_filter_ld_args.tsv}: the function calls and values.
#' }
#' \item \strong{detect_mixed_genomes}: highlight outliers individual's.
#' observed heterozygosity.
#' Described in \code{\link[radiator]{detect_mixed_genomes}}.
#' \itemize{
#' \item \code{individuals.qc.stats.tsv}: individual's heterozygosity and
#' missingness proportion.
#' \item \code{individuals.qc.stats.summary.tsv}: heterozygosity summary statistics.
#' \item \code{heterozygosity.statistics.tsv}: heterozygosity summary statistics
#' per strata and overall.
#' \item \code{individual.heterozygosity.manhattan.plot.pdf}: manhattan plot
#' highlighting potential patterns of heterozygosity and missingness.
#' \item \code{individual.heterozygosity.boxplot.pdf}: boxplot
#' highlighting potential patterns of heterozygosity and missingness.
#' \item \code{blacklist.ind.het.tsv}: blacklisted individuals.
#' \item \code{radiator_detect_mixed_genomes_args.tsv}: the function calls and values.
#' }
#' The function will remove automatically monomorphic markers if individuals are removed.
#' \item \strong{detect_duplicate_genomes}: highligh potential duplicate individuals.
#' Described in \code{\link[radiator]{detect_duplicate_genomes}}.
#' \itemize{
#' \item \code{genotyped.statistics.tsv}: genotyping statistics used in the analysis.
#' \item \code{individuals.pairwise.dist.tsv}: pairwise distances measures.
#' \item \code{individuals.pairwise.distance.stats.tsv}: summary statistics of
#' pairwise distances measures.
#' \item \code{individuals.qc.stats.tsv}: individuals summary produced by default.
#' \item \code{individuals.qc.stats.summary.tsv}: individuals summary produced by default.
#' \item \code{manhattan.plot.distance.pdf}: manhattan plot highlighting duplicates.
#' \item \code{violin.plot.distance.pdf}: boxplot highlighting duplicates.
#' \item \code{blacklist.id.similar.tsv}: blacklisted individuals.
#' \item \code{radiator_detect_duplicate_genomes_args.tsv}: the function calls and values.
#' }
#' \item \strong{filter_hwe}: remove/blacklist markers based on Hardy-Weinberg Equilibrium
#' Described in \code{\link[radiator]{filter_hwe}}.
#' \itemize{
#' \item \code{genotypes.summary.tsv}: summary of genotypes groups.
#' \item \code{hwd.helper.table.tsv}: number of markers blacklisted based on thresholds.
#' \item \code{hw.pop.sum.tsv}: summary of hwe/hwd.
#' \item \code{hwd.plot.blacklist.markers.pdf}: helper plot.
#' \item \code{hwe.manhattan.plot.pdf}: overview of markers in HWD.
#' \item \code{hwe.ternary.plots.missing.data.pdf}: overview of markers in HWD.
#' \item \code{blacklisted and whitelisted markers based on thresholds}.
#' \item \code{radiator_filter_hwe_args.tsv}: the function calls and values.
#' }
#'
#' \item \strong{filtered}:
#' \itemize{
#' \item \code{.rad} file is the tidy data set. To open in R use \code{\link{read_rad}}.
#' \item \code{strata.filtered.tsv} the filtered strata file.
#' \item \code{blacklist.id.tsv}: blacklisted individuals (total).
#' \item \code{blacklist.markers.tsv}: blacklisted markers.
#' \item \code{whitelist.markers.tsv}: whitelisted markers.
#' \item \code{markers_qc.pdf}: markers qc after filters.
#' \item \code{individuals.qc.pdf}: individuals qc after filters.
#' \item \code{individuals.qc.stats.tsv}: individuals qc stats.
#' \item \code{individuals.qc.stats.summary.tsv}: individuals qc stats summary.
#' \item \code{markers_metadata.tsv}: markers final metadata.
#' \item \code{markers_metadata_stats.tsv}: markers metadata stats.
#' \item \code{radiator_genind_.RData} and \code{radiator_stockr.RData}, the
#' genind and stockr files generated by the output argument.
#' }
#'}


#' @section Advance mode:
#'
#' Ideally, use the function in interactive mode and forget about this section.
#' For advance user who want to run in batch mode with pre-defined arguments.
#' Below are the arguments available. Read the function documentation associated
#' with the arguments. Most of them reside in separate module that can be explored
#' separately.
#' \emph{dots-dots-dots ...} allows to pass several arguments for fine-tuning the function:
#' \enumerate{
#' \item \strong{filter.reproducibility}: detailed in \code{\link[radiator]{filter_dart_reproducibility}}.
#' \item \strong{filter.monomorphic}: detailed in \code{\link[radiator]{filter_monomorphic}}.
#' \item \strong{filter.common.markers}: detailed in \code{\link[radiator]{filter_common_markers}}.
#' \item \code{whitelist.markers}: detailed in \code{\link[radiator]{filter_whitelist}}.
#' \item \strong{filter.individuals.missing}: detailed in \code{\link[radiator]{filter_individuals}}.
#' \item \strong{filter.individuals.heterozygosity}: detailed in \code{\link[radiator]{filter_individuals}}.
#' \item \strong{filter.individuals.coverage.total}: detailed in \code{\link[radiator]{filter_individuals}}.
#' \item \strong{filter.ma}: detailed in \code{\link[radiator]{filter_ma}}.
#' \item \strong{filter.coverage}: detailed in \code{\link[radiator]{filter_coverage}}.
#' \item \strong{filter.genotyping}: described in \code{\link[radiator]{filter_genotyping}}.
#' \item \strong{filter.snp.position.read}: described in \code{\link[radiator]{filter_snp_position_read}}.
#' \item \strong{filter.snp.number}: described in \code{\link[radiator]{filter_snp_number}}.
#' \item \strong{filter.short.ld}: described in \code{\link[radiator]{filter_ld}}.
#' \item \strong{filter.long.ld}: described in \code{\link[radiator]{filter_ld}}.
#' \item \strong{long.ld.missing}: described in \code{\link[radiator]{filter_ld}}.
#' \item \strong{ld.method}: described in \code{\link[radiator]{filter_ld}}.
#' \item \strong{detect.mixed.genomes}: described in \code{\link[radiator]{detect_mixed_genomes}}.
#' \item \strong{detect.duplicate.genomes}: described in \code{\link[radiator]{detect_duplicate_genomes}}.
#' \item \strong{dup.threshold}: described in \code{\link[radiator]{detect_duplicate_genomes}}.
#' \item \strong{filter.hwe}: described in \code{\link[radiator]{filter_hwe}}.
#' \item \strong{hw.pop.threshold}: described in \code{\link[radiator]{filter_hwe}}.
#' \item \strong{midp.threshold}: described in \code{\link[radiator]{filter_hwe}}.
#' \item \strong{filter.hwe}: described in \code{\link[radiator]{filter_hwe}}.
#' \item \strong{filter.hwe}: described in \code{\link[radiator]{filter_hwe}}.
#' \item \strong{ind.heterozygosity.threshold}: described in \code{\link[radiator]{detect_mixed_genomes}}.
#' \item \strong{ind.heterozygosity.threshold}: described in \code{\link[radiator]{detect_mixed_genomes}}.
#' \item \strong{ind.heterozygosity.threshold}: described in \code{\link[radiator]{detect_mixed_genomes}}.
#' }

#' @export
#' @rdname filter_rad
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}

filter_rad <- function(
  data,
  strata = NULL,
  interactive.filter = TRUE,
  output = NULL,
  filename = NULL,
  verbose = TRUE,
  parallel.core = parallel::detectCores() - 1,
  ...
) {

  # Cleanup---------------------------------------------------------------------
  radiator_function_header(f.name = "filter_rad", verbose = verbose)
  file.date <- format(Sys.time(), "%Y%m%d@%H%M")
  if (verbose) message("Execution date@time: ", file.date)
  old.dir <- getwd()
  opt.change <- getOption("width")
  options(width = 70)
  timing <- radiator_tic()
  res <- list()
  #back to the original directory and options
  on.exit(setwd(old.dir), add = TRUE)
  on.exit(options(width = opt.change), add = TRUE)
  on.exit(radiator_toc(timing), add = TRUE)
  on.exit(radiator_function_header(f.name = "filter_rad", start = FALSE, verbose = verbose), add = TRUE)

  # Required package -----------------------------------------------------------
  if (interactive.filter) {
    radiator_packages_dep(package = "BiocManager")
    radiator_packages_dep(package = "HardyWeinberg")
    # radiator_packages_dep(package = "ggtern")
    radiator_packages_dep(package = "UpSetR")
    radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)
    radiator_packages_dep(package = "SNPRelate", cran = FALSE, bioc = TRUE)
  }

  # Function call and dotslist -------------------------------------------------
  rad.dots <- radiator_dots(
    func.name = as.list(sys.call())[[1]],
    fd = rlang::fn_fmls_names(),
    args.list = as.list(environment()),
    dotslist = rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE),
    keepers = c(
      "subsample.markers.stats",
      "filter.reproducibility", "filter.individuals.missing",
      "filter.individuals.heterozygosity", "filter.individuals.coverage.total",
      "filter.individuals.coverage.median", "filter.individuals.coverage.iqr",
      "filter.common.markers", "filter.monomorphic", "filter.ma",
      "filter.coverage", "filter.genotyping", "filter.snp.position.read",
      "filter.snp.number", "filter.short.ld", "filter.long.ld", "long.ld.missing",
      "ld.method", "detect.mixed.genomes", "ind.heterozygosity.threshold",
      "detect.duplicate.genomes", "dup.threshold",
      "filter.hwe", "hw.pop.threshold", "midp.threshold",
      "filter.strands", "random.seed", "path.folder", "filename",
      "blacklist.genotype", "erase.genotypes",
      "gt", "gt.bin", "gt.vcf", "gt.vcf.nuc",
      "pop.levels",
      "whitelist.markers",
      "write.tidy",
      "missing.memory", "internal"),
    deprecated = c("pop.select, pop.labels", "blacklist.id"),
    verbose = FALSE
  )
  filter.common.markers.bk <- filter.common.markers
  filter.monomorphic.bk <- filter.monomorphic
  filter.individuals.missing.bk <- filter.individuals.missing
  filter.individuals.heterozygosity.bk <- filter.individuals.heterozygosity
  filter.individuals.coverage.total.bk <- filter.individuals.coverage.total
  filter.individuals.coverage.median.bk <- filter.individuals.coverage.median
  filter.individuals.coverage.iqr.bk <- filter.individuals.coverage.iqr
  filter.ma.bk <- filter.ma
  filter.coverage.bk <- filter.coverage
  filter.genotyping.bk <- filter.genotyping
  filter.snp.position.read.bk <- filter.snp.position.read
  filter.snp.number.bk <- filter.snp.number
  filter.short.ld.bk <- filter.short.ld
  filter.long.ld.bk <- filter.long.ld
  long.ld.missing.bk <- long.ld.missing
  ld.method.bk <- ld.method
  detect.mixed.genomes.bk <- detect.mixed.genomes
  ind.heterozygosity.threshold.bk <- ind.heterozygosity.threshold
  detect.duplicate.genomes.bk <- detect.duplicate.genomes
  dup.threshold.bk <- dup.threshold
  filter.hwe.bk <- filter.hwe
  hw.pop.threshold.bk <- hw.pop.threshold
  midp.threshold.bk <- midp.threshold


  # Test
  # internal <- FALSE
  # path.folder <- NULL
  # random.seed <- NULL
  # pop.levels <- NULL


  # Checking for missing and/or default arguments ------------------------------
  if (missing(data)) rlang::abort("data is missing")

  # Folders---------------------------------------------------------------------
  # wf for working folder
  # radiator.folder : the 01_radiator folder to put most things...
  wf <- path.folder <- generate_folder(
    f = path.folder,
    rad.folder = "filter_rad",
    prefix_int = FALSE,
    internal = internal,
    file.date = file.date,
    verbose = verbose)

  radiator.folder <- generate_folder(
    f = path.folder,
    rad.folder = "radiator",
    prefix_int = TRUE,
    internal = FALSE,
    file.date = file.date,
    verbose = verbose)

  # write the dots file
  write_rad(
    data = rad.dots,
    path = radiator.folder,
    filename = stringi::stri_join(
      "radiator_filter_rad_args_", file.date, ".tsv"),
    tsv = TRUE,
    internal = internal,
    write.message = "Function call and arguments stored in: ",
    verbose = verbose
  )

  # Random seed ----------------------------------------------------------------
  if (is.null(random.seed)) {
    random.seed <- sample(x = 1:1000000, size = 1)
    set.seed(random.seed)
  } else {
    set.seed(random.seed)
  }
  readr::write_lines(x = random.seed, file = file.path(radiator.folder, "random.seed"))
  if (verbose) message("File written: random.seed (", random.seed,")")

  # Filter parameter file: generate --------------------------------------------
  filters.parameters <- radiator_parameters(
    generate = TRUE,
    initiate = FALSE,
    update = FALSE,
    parameter.obj = NULL,
    path.folder = radiator.folder,
    file.date = file.date,
    verbose = verbose)

  # File type detection----------------------------------------------------------
  data.type <- radiator::detect_genomic_format(data)

  # Import file ----------------------------------------------------------------
  if (data.type %in% c(
    "tbl_df", "fst.file", "SeqVarGDSClass", "gds.file", "vcf.file", "dart", "plink.bed.file")) {

    if (data.type %in% c("tbl_df", "fst.file", "SeqVarGDSClass", "gds.file")) {
      gds <- read_rad(data)
      data.type <- radiator::detect_genomic_format(gds)
      data <- NULL
    }

    if (data.type %in% c("vcf.file")) {
      gds <- read_vcf(
        data = data,
        strata = strata,
        filter.strands = filter.strands,
        # random.seed = random.seed,
        filter.monomorphic = FALSE,
        filter.common.markers = FALSE,
        internal = TRUE,
        # filters.parameters = filters.parameters,
        path.folder = radiator.folder,
        parallel.core = parallel.core,
        verbose = FALSE)
      data.type <- "SeqVarGDSClass"
    }

    if (data.type %in% c("dart")) {
      gds <- read_dart(
        data = data,
        strata = strata,
        filename = filename,
        verbose = FALSE,
        parallel.core = parallel.core,
        path.folder = radiator.folder,
        pop.levels = pop.levels,
        internal = TRUE
      )
      data.type <- "SeqVarGDSClass"
    }

    if (data.type %in% c("plink.bed.file")) {
      gds <- read_plink(
        data = data,
        filename = filename,
        parallel.core = parallel.core,
        verbose = FALSE,
        path.folder = radiator.folder,
        internal = TRUE
      )
      data.type <- "SeqVarGDSClass"
    }
  } else {
    gds <- radiator::tidy_genomic_data(
      data = data,
      strata = strata,
      parallel.core = parallel.core,
      verbose = FALSE,
      internal = TRUE,
      vcf.metadata = TRUE,
      vcf.stats = TRUE,
      gt.vcf.nuc = FALSE,
      gt.vcf = FALSE,
      gt = FALSE,
      gt.bin = TRUE
    )
    data.type <- radiator::detect_genomic_format(gds)
  }

  if (data.type == "tbl_df") gds <- tidy2gds(x = gds) #End tbl_df
  source <- extract_data_source(gds) # to know if dart data or not...

  # Filter reproducibility -----------------------------------------------------
  if ("dart" %in% source) {
    # Filter parameter file: initiate
    filters.parameters <- radiator_parameters(
      generate = FALSE,
      initiate = TRUE,
      update = FALSE,
      parameter.obj = filters.parameters,
      data = gds,
      path.folder = radiator.folder,
      file.date = file.date,
      verbose = verbose)

    gds <- filter_dart_reproducibility(
      data = gds,
      interactive.filter = interactive.filter,
      filter.reproducibility = filter.reproducibility,
      parallel.core = parallel.core,
      verbose = verbose,
      parameters = filters.parameters,
      path.folder = wf,
      internal = FALSE)
  }

  # Filter_monomorphic----------------------------------------------------------
  gds <- filter_monomorphic(
    data = gds,
    filter.monomorphic = filter.monomorphic.bk,
    parallel.core = parallel.core,
    verbose = verbose,
    parameters = filters.parameters,
    path.folder = wf,
    internal = FALSE)

  # Filter common markers ------------------------------------------------------
  gds <- filter_common_markers(
    data = gds,
    filter.common.markers = filter.common.markers.bk,
    fig = TRUE,
    parallel.core = parallel.core,
    verbose = verbose,
    parameters = filters.parameters,
    path.folder = wf,
    internal = FALSE)

  # Filter Whitelist------------------------------------------------------------
  gds <- filter_whitelist(
    data = gds,
    whitelist.markers = whitelist.markers,
    verbose = verbose,
    path.folder = wf,
    parameters = filters.parameters,
    biallelic = biallelic,
    internal = FALSE)

  # Genotypes metadata ---------------------------------------------------------
  # check for coverage information...
  # genotypes.meta <- extract_genotypes_metadata(gds, index.only = TRUE)
  # count.data <- "ALLELE_REF_DEPTH" %in% genotypes.meta
  # coverage.info <- "READ_DEPTH" %in% genotypes.meta
  # genotypes.meta <- NULL

  # Filter_individuals----------------------------------------------------------
  gds <- filter_individuals(
    data = gds,
    interactive.filter = interactive.filter,
    filter.individuals.missing = filter.individuals.missing.bk,
    filter.individuals.heterozygosity = filter.individuals.heterozygosity.bk,
    filter.individuals.coverage.total = filter.individuals.coverage.total.bk,
    filter.individuals.coverage.median = filter.individuals.coverage.median.bk,
    filter.individuals.coverage.iqr = filter.individuals.coverage.iqr.bk,
    parallel.core = parallel.core,
    verbose = verbose,
    path.folder = wf,
    parameters = filters.parameters,
    internal = FALSE
  )

  # Filter MAC------------------------------------------------------------------
  gds <- filter_ma(
    data = gds,
    interactive.filter = interactive.filter,
    filter.ma = filter.ma.bk,
    filename = NULL,
    parallel.core = parallel.core,
    verbose = verbose,
    parameters = filters.parameters,
    path.folder = wf,
    internal = FALSE)


  # Filter coverage-------------------------------------------------------------
  gds <- filter_coverage(
    data = gds,
    interactive.filter = interactive.filter,
    filter.coverage = filter.coverage.bk,
    filename = NULL,
    parallel.core = parallel.core,
    verbose = verbose,
    parameters = filters.parameters,
    path.folder = wf,
    internal = FALSE)

  # Filter genotyping---------------------------------------------------------
  gds <- filter_genotyping(
    data = gds,
    interactive.filter = interactive.filter,
    filter.genotyping = filter.genotyping.bk,
    filename = NULL,
    parallel.core = parallel.core,
    verbose = verbose,
    parameters = filters.parameters,
    path.folder = wf,
    internal = FALSE)

  # Filter SNP position on the read---------------------------------------------
  gds <- filter_snp_position_read(
    data = gds,
    interactive.filter = interactive.filter,
    filter.snp.position.read = filter.snp.position.read.bk,
    filename = NULL,
    parallel.core = parallel.core,
    verbose = verbose,
    parameters = filters.parameters,
    path.folder = wf,
    internal = FALSE)


  # Filter snp number per locus ------------------------------------------------
  gds <- filter_snp_number(
    data = gds,
    interactive.filter = interactive.filter,
    filter.snp.number = filter.snp.number.bk,
    filename = NULL,
    parallel.core = parallel.core,
    verbose = verbose,
    parameters = filters.parameters,
    path.folder = wf,
    internal = FALSE)

  # Filter Linkage disequilibrium --------------------------------------------
  continue.with.ld <- "y"

  if (interactive.filter) {
    continue.with.ld <- radiator_question(
      x = "\n\nDo you want to filter the dataset using short distance ld  ? (y/n):",
      answer.opt = c("y", "n"))
  }

  if (continue.with.ld == "y") {
    gds <- filter_ld(
      data = gds,
      iinteractive.filter = interactive.filter,
      filter.short.ld = filter.short.ld.bk,
      filter.long.ld = filter.long.ld.bk,
      parallel.core = parallel.core,
      filename = NULL,
      iiiiverbose = verbose,
      long.ld.missing = long.ld.missing.bk,
      ld.method = ld.method.bk,
      parameters = filters.parameters,
      path.folder = wf,
      internal = FALSE)
  }

  # Detect mixed genomes -------------------------------------------------------
  gds <- detect_mixed_genomes(
    data = gds,
    interactive.filter = interactive.filter,
    detect.mixed.genomes = detect.mixed.genomes.bk,
    ind.heterozygosity.threshold = NULL,
    parameters = filters.parameters,
    verbose = verbose,
    parallel.core = parallel.core,
    path.folder = wf,
    internal = FALSE)

  #Detect duplicate genomes ---------------------------------------------------
  gds <- detect_duplicate_genomes(
    data = gds,
    interactive.filter = interactive.filter,
    detect.duplicate.genomes = detect.duplicate.genomes.bk,
    dup.threshold = dup.threshold,
    parallel.core = parallel.core,
    verbose = verbose,
    parameters = filters.parameters,
    random.seed = random.seed,
    path.folder = wf,
    internal = FALSE)

  # Filter HWE -----------------------------------------------------------------
  gds <- filter_hwe(
    data = gds,
    interactive.filter = interactive.filter,
    filter.hwe = filter.hwe.bk,
    strata = NULL,
    hw.pop.threshold = hw.pop.threshold.bk,
    midp.threshold = midp.threshold.bk,
    parallel.core = parallel.core,
    parameters = filters.parameters,
    path.folder = wf,
    verbose = verbose,
    internal = FALSE)


  # Final Sync GDS -----------------------------------------------------------
  if (verbose) message("\nPreparing output files...")
  markers.meta <- extract_markers_metadata(
    gds = gds,
    whitelist = TRUE)

  strata <- extract_individuals_metadata(
    gds = gds,
    # ind.field.select = c("INDIVIDUALS", "STRATA"),
    whitelist = TRUE)

  sync_gds(
    gds = gds,
    samples = strata$INDIVIDUALS,
    variant.id = markers.meta$VARIANT_ID
  )

  # FINAL PREP
  # filtered data folder
  path.folder <- generate_folder(
    f = wf,
    rad.folder = "filtered",
    internal = FALSE,
    file.date = file.date,
    verbose = verbose)

  # Whitelist
  write_rad(data = markers.meta,
            path = path.folder,
            filename = "whitelist.markers.tsv",
            tsv = TRUE,
            write.message = "standard",
            verbose = verbose)
  # blacklist
  bl <- extract_markers_metadata(gds = gds, blacklist = TRUE)
  if (nrow(bl) > 0) {
    write_rad(data = bl,
              path = path.folder,
              filename = "blacklist.markers.tsv",
              tsv = TRUE,
              write.message = "standard",
              verbose = verbose)
  }


  # writing the blacklist of id
  blacklist.id <- extract_individuals_metadata(gds = gds, blacklist = TRUE)
  if (nrow(blacklist.id) > 0) {
    write_rad(data = blacklist.id,
              path = path.folder,
              filename = "blacklist.id.tsv",
              tsv = TRUE,
              write.message = "standard",
              verbose = verbose)
  }

  # Generate new strata
  write_rad(data = strata,
            path = path.folder,
            filename = "strata.filtered.tsv",
            tsv = TRUE,
            write.message = "Writing the filtered strata: strata.filtered.tsv",
            verbose = verbose)


  # Statistics after filtering -------------------------------------------------
  if (verbose) message("\nGenerating statistics after filtering")

  # Individuals and markers stats
  i.m.stats <- generate_stats(
    gds = gds,
    individuals = TRUE,
    markers = TRUE,
    missing = TRUE,
    heterozygosity = TRUE,
    coverage = TRUE,
    allele.coverage = TRUE,
    mac = TRUE,
    snp.position.read = TRUE,
    snp.per.locus = TRUE,
    path.folder = path.folder,
    file.date = file.date,
    parallel.core = parallel.core,
    verbose = verbose
  )

  # genomic_converter-----------------------------------------------------------
  if (verbose) message("\nTransferring data to genomic converter...")
  res$output <- genomic_converter(
    data = gds,
    strata = strata,
    output = output,
    parallel.core = parallel.core,
    filename = filename,
    verbose = FALSE,
    path.folder = path.folder,
    parameters = filters.parameters,
    filter.common.markers = FALSE,
    filter.monomorphic = FALSE,
    internal = TRUE)
  if (is.null(res$output)) res$output <- "not selected"
  summary_gds(gds = gds, verbose = TRUE)
  res$gds <- gds
  return(res)
}#End filter_rad
thierrygosselin/radiator documentation built on May 5, 2024, 5:12 a.m.