R/detect_duplicate_genomes.R

Defines functions detect_duplicate_genomes

Documented in detect_duplicate_genomes

# Detect duplicate genomes

#' @name detect_duplicate_genomes
#' @title Compute pairwise genome similarity or distance between individuals
#' to highligh potential duplicate individuals
#' @description The function can compute two methods
#' to highligh potential duplicate individuals.
#' \enumerate{
#' \item distance between individuals and/or
#' \item pairwise genome similarity
#' }

#' @param data (4 options) A file or object generated by radiator:
#' \itemize{
#' \item tidy data
#' \item Genomic Data Structure (GDS)
#' }
#'
#' \emph{How to get GDS and tidy data ?}
#' Look into \code{\link{tidy_genomic_data}},
#' \code{\link{read_vcf}} or
#' \code{\link{tidy_vcf}}.

#' @param detect.duplicate.genomes (optional, logical) For use inside radiator pipelines.
#' Default: \code{detect.duplicate.genomes = TRUE}.


# @param subsample.markers (optional, integer) To speed up computation and rapidly
# test the function's arguments (e.g. using 200 markers).
# Default: \code{subsample.markers = NULL}.

# @param random.seed (integer, optional) For reproducibility, set an integer
# for randomness when argument \code{subsample.markers} is used.
# By default, a random number is generated and printed.
# Default: \code{random.seed = NULL}.

#' @param distance.method (character) Depending on input data, 2 different methods
#' are used (give similar results):
#' \itemize{
#' \item gds data The calculation is fast it's \code{SNPRelate::snpgdsIBS} under
#' the hood.
#' \item tidy data The distance measure uses \code{amap::Dist}.
#' This must be one of "euclidean", "maximum", "manhattan", "canberra", "binary".
#' }
#' Using \code{distance.method = NULL} will not run this method.
#' Default: \code{distance.method = "manhattan"}. This is very fast
#' compared to the genome similarity method. It uses allele counts and the codes
#' are tailored for biallelic and multiallelic markers.

#' @param genome (logical) Computes pairwise genome similarity in parallel.
#' The proportion of the shared genotypes is averaged across shared markers between
#' each pairwise comparison. This method makes filtering easier because the
#' threshold is more intuitive with the plots produced, but it's much longer
#' to run, even in parallel, so better to run overnight.
#' Default: \code{genome = FALSE}.

#' @param threshold.common.markers (double, optional) When using the
#' pairwise genome similarity approach (\code{genome = TRUE}), using a threshold
#' will filter the pairwise comparison before generating the results. This is
#' usefull if the number of pairwise comparisons (n*(n-1)/2) is very large and
#' allows to reduce the number of false positive, when missing data is involved.
#' e.g. 2 samples might have 100% markers in common, but if they only have
#' 30% of their genotyped markers in common, this is a poor fit.
#' Default: \code{threshold.common.markers = NULL}.
#'
#' @param dup.threshold Default: \code{dup.threshold = 0}. Turn off filter.

#' @param blacklist.duplicates (optional, logical)
#' With \code{blacklist.duplicates = TRUE}, after visualization,
#' the user is asked to enter a threshold to filter out duplicates.
#' With default, \code{blacklist.duplicates = FALSE}, the function as no
#' interaction with user.

#' @inheritParams radiator_common_arguments

#' @return A list with potentially 8 objects:
#' \itemize{
#' \item \code{$distance }: results of the distance method.
#' \item \code{$distance.stats}: Summary statistics of the distance method.
#' \item \code{$pairwise.genome.similarity}: results of the genome method.
#' \item \code{$genome.stats}: Summary statistics of the genome method.
#' \item \code{$violin.plot.distance}: violin plot showing the distribution of pairwise distances.
#' \item \code{$manhattan.plot.distance}: same info different visual with manhattan plot.
#' \item \code{$violin.plot.genome}: violin plot showing the distribution of pairwise genome similarities.
#' \item \code{$manhattan.plot.genome}: same info different visual with manhattan plot.
#' \item \code{$blacklist.id.similar}: blacklisted duplicates.
#' }
#'
#' Saved in the working directory:
#' individuals.pairwise.dist.tsv, individuals.pairwise.distance.stats.tsv,
#' individuals.pairwise.genome.similarity.tsv, individuals.pairwise.genome.stats.tsv,
#' blackliste.id.similar.tsv, blacklist.pairs.threshold.tsv

#' @details
#' Strategically, run the default first (\code{distance.method},
#' no \code{genome})
#'
#' \strong{\code{distance.method} argument is fast, but...}
#'
#' you don't know if the observed comparison (close or distant)
#' is influenced by missing values/the number of markers in common
#' between the pair compared. This is something that needs to be considered.
#' Be suspicious of a \emph{distant outlier} from the same pop pairwise comparison,
#' and similarly, be suspicious of a \emph{close outlier} from a different pop
#' pairwise comparisons.
#'
#' If there is no outlier, don't bother running the function again with
#' (\code{genome = TRUE}).
#'
#'\strong{Relative distance}
#'
#' Is the normalized distance for your dataset (not calculated by strata). For
#' each individual, it's the distance divided by the maximum distance observed.
#' The range is limited between 0 and 1. Closer to 0 = the more similar and
#' closer to 1, the more distant.
#'
#' \strong{\code{genome = TRUE}}
#'
#' The function will run slower, but...
#' If you see outliers with the first run, take the time to run the function
#' with \code{genome = TRUE}. Because this option is much better at detecting
#' duplicated individuals and it also shows the impact of \strong{missingness}
#' or the number of \strong{shared markers} between comparisons.
#'
#' \emph{Your outlier duo could well be the result of one of the individual having
#' an extremely low number of genotypes...}


#' @export
#' @rdname detect_duplicate_genomes

#' @examples
#' \dontrun{
#' # First run and simplest way (if you have the tidy tibble):
#' dup <- radiator::detect_duplicate_genomes(data = "wombat_tidy.rad")
#'
#' # This will use by default:
#' # distance.method = "manhattan"
#' # genome = FALSE
#' # parallel.core = all my CPUs - 1
#'
#' # If you need a tidy tibble: use one of radiator \code{tidy_} function or
#' # \code{radiator::tidy_genomic_data}
#'
#'
#' # To view the manhattan plot:
#' dup$manhattan.plot.distance
#'
#' # to view the data stats
#' dup.data.stats <- dup$distance.stats
#'
#' # to view the data
#' dup.data <- dup$distance
#'
#' # Based on the look of the distribution using both manhattan and boxplot,
#' # I can filter the dataset to highlight potential duplicates.
#'
#' # To run the distance (with euclidean distance instead of the default manhattan,
#' # and also carry the second analysis (with the genome method):
#' dup <- radiator::tidy_genomic_data(
#'     data = wombat_tidy_object,
#'     strata = "wombat.strata.tsv",
#'     vcf.metadata = FALSE
#' ) %>%
#' radiator::detect_duplicate_genomes(
#'     data = .,
#'     distance.method = "euclidean",
#'     genome = TRUE
#'     )
#'
#' # to view the data of the genome data
#' dup.data <- dup$pairwise.genome.similarity
#'
#' # Based on the look of the distribution using both manhattan and boxplot,
#' # I can filter the dataset based on 98% of identical genotype proportion,
#' # to highlight potential duplicates:
#' dup.filtered <- dplyr::filter(.data = dup.data, PROP_IDENTICAL > 0.98)
#'
#' # Get the list of duplicates id
#' dup.list.names <- data.frame(INDIVIDUALS = unique(c(dup.filtered$ID1, dup.filtered$ID2)))
#' }

#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}

detect_duplicate_genomes <- function(
  data,
  interactive.filter = TRUE,
  detect.duplicate.genomes = TRUE,
  dup.threshold = 0,
  # subsample.markers = NULL,
  # random.seed = NULL,
  distance.method = "manhattan",
  genome = FALSE,
  threshold.common.markers = NULL,
  blacklist.duplicates = FALSE,
  parallel.core = parallel::detectCores() - 1,
  verbose = TRUE,
  ...
) {

  # # Testing
  # interactive.filter = TRUE
  # detect.duplicate.genomes = TRUE
  # dup.threshold = 0
  # distance.method = "manhattan"
  # genome = FALSE
  # threshold.common.markers = NULL
  # blacklist.duplicates = FALSE
  # parallel.core = parallel::detectCores() - 1
  # verbose = TRUE
  # random.seed = NULL
  # path.folder = NULL
  # parameters = NULL
  # internal <- FALSE

  if (interactive.filter || detect.duplicate.genomes) {
    if (interactive.filter) {
      verbose <- TRUE
      blacklist.duplicates <- TRUE
    }

    # Cleanup-------------------------------------------------------------------
    radiator_function_header(f.name = "detect_duplicate_genomes", verbose = verbose)
    file.date <- format(Sys.time(), "%Y%m%d@%H%M")
    if (verbose) message("Execution date@time: ", file.date)
    old.dir <- getwd()
    opt.change <- getOption("width")
    options(width = 70)
    timing <- radiator_tic()
    on.exit(setwd(old.dir), add = TRUE)
    on.exit(options(width = opt.change), add = TRUE)
    on.exit(radiator_toc(timing, verbose = verbose), add = TRUE)
    on.exit(radiator_function_header(f.name = "detect_duplicate_genomes", start = FALSE, verbose = verbose), add = TRUE)
    res <- list() # New list to prepare for results

    # Function call and dotslist -------------------------------------------------
    rad.dots <- radiator_dots(
      func.name = as.list(sys.call())[[1]],
      fd = rlang::fn_fmls_names(),
      args.list = as.list(environment()),
      dotslist = rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE),
      keepers = c("path.folder", "parameters", "random.seed", "internal"),
      verbose = FALSE
    )

    # Manage missing arguments ---------------------------------------------------
    if (missing(data)) rlang::abort("missing data argument")

    # Folders---------------------------------------------------------------------
    path.folder <- generate_folder(
      f = path.folder,
      rad.folder = "detect_duplicate_genomes",
      internal = internal,
      file.date = file.date,
      verbose = verbose)

    # write the dots file
    message("Function call and arguments stored in a file")
    write_rad(
      data = rad.dots,
      path = path.folder,
      filename = stringi::stri_join("radiator_detect_duplicate_genomes_args_", file.date, ".tsv"),
      tsv = TRUE,
      internal = internal,
      verbose = verbose
    )

    # Random seed ----------------------------------------------------------------
    if (is.null(random.seed)) random.seed <- sample(x = 1:1000000, size = 1)
    set.seed(random.seed)
    readr::write_lines(x = random.seed, file = file.path(path.folder, "random.seed"))
    if (verbose) message("File written: random.seed (", random.seed,")")

    # Detect format --------------------------------------------------------------
    data.type <- radiator::detect_genomic_format(data)

    if (!data.type %in% c("SeqVarGDSClass", "gds.file")) {
      radiator_packages_dep(package = "amap")

      # Tidy data
      data <- radiator::tidy_wide(data = data, import.metadata = FALSE)

      # Filter parameter file: generate and initiate
      filters.parameters <- radiator_parameters(
        generate = TRUE,
        initiate = TRUE,
        update = FALSE,
        parameter.obj = parameters,
        data = data,
        path.folder = path.folder,
        file.date = file.date,
        internal = internal,
        verbose = verbose)



      if (rlang::has_name(data, "GT_BIN")) {
        gt.field <- "GT_BIN"
      } else {
        gt.field <- "GT"
      }

      want <- c("MARKERS", "CHROM", "LOCUS", "POS", "STRATA", "INDIVIDUALS", gt.field)#, "REF", "ALT")
      data %<>% dplyr::select(tidyselect::any_of(want))

      # necessary steps to make sure we work with unique markers and not duplicated LOCUS
      if (rlang::has_name(data, "LOCUS") && !rlang::has_name(data, "MARKERS")) {
        data <- dplyr::rename(.data = data, MARKERS = LOCUS)
      }

      # Subsampling ----------------------------------------------------------------
      # if (!is.null(subsample.markers)) {
      #   # Set seed for random sampling
      #   if (is.null(random.seed)) {
      #     random.seed <- sample(x = 1:1000000, size = 1)
      #     set.seed(random.seed)
      #     message("Random seed used: ", random.seed)
      #   } else {
      #     set.seed(random.seed)
      #   }
      #   sample.markers <- dplyr::distinct(data, MARKERS) %>%
      #     dplyr::sample_n(tbl = ., size = subsample.markers) %>%
      #     readr::write_tsv(x = ., file = file.path(path.folder, stringi::stri_join("subsampled.markers_random.seed_", random.seed, ".tsv"))) %>%
      #     purrr::flatten_chr(.)
      #   data <- dplyr::filter(data, MARKERS %in% sample.markers)
      #   sample.markers <- NULL
      # }

      # strata
      strata <- dplyr::ungroup(data) %>%
        dplyr::distinct(STRATA, INDIVIDUALS)

      # Preparing data for comparisons ---------------------------------------------
      if (verbose) message("Preparing data for analysis")

      #Genotyped stats -------------------------------------------------------------
      n.markers <- dplyr::n_distinct(data$MARKERS)
      if (gt.field == "GT") {
        geno.stats <- dplyr::filter(data, GT != "000000") %>%
          dplyr::group_by(INDIVIDUALS) %>%
          dplyr::summarise(GENOTYPED_PROP = length(GT) / n.markers)
      } else {
        geno.stats <- dplyr::filter(data, !is.na(GT_BIN)) %>%
          dplyr::group_by(INDIVIDUALS) %>%
          dplyr::summarise(GENOTYPED_PROP = length(GT_BIN) / n.markers)
      }
      readr::write_tsv(
        x = geno.stats,
        file = file.path(path.folder, "genotyped.statistics.tsv"))

      # GT_BIN available
      # for biallelic data set, just need to keep
      if (!is.null(distance.method) & gt.field == "GT_BIN") {
        # input.prep <- dplyr::ungroup(data) %>%
        #   dplyr::select(MARKERS, INDIVIDUALS, n = GT_BIN) %>%
        #   dplyr::mutate(MARKERS_ALLELES = MARKERS) %>%
        #   dplyr::arrange(MARKERS_ALLELES, INDIVIDUALS)

        input.prep <- dplyr::ungroup(data) %>%
          dplyr::select(MARKERS, INDIVIDUALS, ALT = GT_BIN) %>%
          dplyr::mutate(
            REF = 2 - ALT,
            REF = as.integer(REF),
            ALT = as.integer(ALT)
          ) %>%
          rad_long(
            x = .,
            cols = c("MARKERS", "INDIVIDUALS"),
            names_to = "ALLELES",
            values_to = "n",
            variable_factor = FALSE
          ) %>%
          dplyr::mutate(MARKERS_ALLELES = stringi::stri_join(MARKERS, ALLELES, sep = ".")) %>%
          dplyr::select(-ALLELES) %>%
          dplyr::arrange(MARKERS_ALLELES, INDIVIDUALS)
      }

      # GT_BIN NOT available
      if (gt.field != "GT_BIN") {
        # Allele count
        missing.geno <- dplyr::select(.data = data, MARKERS, INDIVIDUALS, GT) %>%
          dplyr::filter(GT == "000000") %>%
          dplyr::select(-GT) %>%
          dplyr::mutate(MISSING = rep("blacklist", n()))

        if (verbose) message("Preparing data: calculating allele count")
        input.prep <- dplyr::select(data, MARKERS, INDIVIDUALS, GT) %>%
          radiator_future(
            .x = .,
            .f = allele_count,
            flat.future = "dfr",
            split.vec = TRUE,
            split.with = "MARKERS",
            split.chunks = 10L,
            parallel.core = parallel.core
          )

        if (nrow(missing.geno) > 0) {
          input.prep %<>% dplyr::anti_join(missing.geno, by = c("MARKERS", "INDIVIDUALS"))
        }

        input.prep %<>%
          dplyr::mutate(MARKERS_ALLELES = stringi::stri_join(MARKERS, GT, sep = ".")) %>%
          dplyr::select(-GT) %>%
          dplyr::arrange(MARKERS_ALLELES, INDIVIDUALS)

        missing.geno <- NULL # unused object
      }#end preparing data
    } else {
      if (data.type == "gds.file") {
        data <- radiator::read_rad(data, verbose = verbose)
        data.type <- "SeqVarGDSClass"
      }

      radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)


      # Filter parameter file: generate and initiate
      filters.parameters <- radiator_parameters(
        generate = TRUE,
        initiate = TRUE,
        update = FALSE,
        parameter.obj = parameters,
        data = data,
        path.folder = path.folder,
        file.date = file.date,
        verbose = verbose)

      # geno.stats

      geno.stats <- generate_stats(
        gds = data, # change to data...
        markers = FALSE,
        missing = TRUE,
        heterozygosity = FALSE,
        coverage = FALSE,
        allele.coverage = FALSE,
        exhaustive = FALSE,
        plot = FALSE,
        file.date = file.date,
        parallel.core = parallel.core,
        verbose = FALSE,
        path.folder = path.folder) %$% i.info %>%
        dplyr::mutate(GENOTYPED_PROP = 1 - MISSING_PROP, MISSING_PROP = NULL) #%>% dplyr::rename(POP_ID = STRATA)

      strata <- dplyr::distinct(geno.stats, INDIVIDUALS, STRATA)

      geno.stats %<>% dplyr::select(INDIVIDUALS, GENOTYPED_PROP )
      readr::write_tsv(
        x = geno.stats,
        file = file.path(path.folder, "genotyped.statistics.tsv"))
      genome <- FALSE
    }
    # Computing distance ---------------------------------------------------------
    if (!is.null(distance.method)) {
      message("Calculating ", distance.method, " distances between individuals...")

      res$distance <- distance_individuals(
        x = if (data.type == "tbl_df") {
          input.prep
        } else {
          data
        },
        strata = strata,
        distance.method = distance.method,
        parallel.core = parallel.core
      ) %>%
        dplyr::arrange(DISTANCE_RELATIVE) %>%
        dplyr::mutate(PAIRS = seq(from = 1, to = n(), by = 1)) %>%
        dplyr::arrange(PAIRS)
      input.prep <- NULL

      res$distance <- suppressWarnings(
        dplyr::bind_cols(
          res$distance,
          dplyr::select(res$distance, ID1, ID2, PAIRS) %>%
            dplyr::left_join(dplyr::rename(geno.stats, ID1 = INDIVIDUALS, ID1_G = GENOTYPED_PROP), by = "ID1") %>%
            dplyr::left_join(dplyr::rename(geno.stats, ID2 = INDIVIDUALS, ID2_G = GENOTYPED_PROP), by = "ID2") %>%
            rad_long(
              x = .,
              cols = c("ID1", "ID2", "PAIRS"),
              names_to = "GENOTYPED_MAX",
              values_to = "GENOTYPED_PROP",
              variable_factor = FALSE
            ) %>%
            dplyr::group_by(PAIRS) %>%
            dplyr::filter(GENOTYPED_PROP == max(GENOTYPED_PROP)) %>%
            dplyr::ungroup(.) %>%
            dplyr::distinct(PAIRS, .keep_all = TRUE) %>%
            dplyr::select(-GENOTYPED_PROP) %>%
            dplyr::mutate(GENOTYPED_MAX = dplyr::if_else(GENOTYPED_MAX == "ID1_G", ID1, ID2)) %>%
            dplyr::arrange(PAIRS) %>%
            dplyr::select(-ID1, -ID2, -PAIRS)
        )
      )


      # test <- res$distance
      readr::write_tsv(
        x = res$distance,
        file = file.path(path.folder, "individuals.pairwise.dist.tsv"),
        col_names = TRUE
      )

      # Stats
      message("Generating summary statistics")
      res$distance.stats <- res$distance %>%
        dplyr::summarise(
          MEAN = mean(DISTANCE_RELATIVE, na.rm = TRUE),
          MEDIAN = stats::median(DISTANCE_RELATIVE, na.rm = TRUE),
          SE = round(sqrt(stats::var(DISTANCE_RELATIVE, na.rm = TRUE)/length(stats::na.omit(DISTANCE_RELATIVE))), 2),
          MIN = round(min(DISTANCE_RELATIVE, na.rm = TRUE), 2),
          MAX = round(max(DISTANCE_RELATIVE, na.rm = TRUE), 2),
          QUANTILE25 = stats::quantile(DISTANCE_RELATIVE, 0.25), # quantile25
          QUANTILE75 = stats::quantile(DISTANCE_RELATIVE, 0.75)#, # quantile75
          # OUTLIERS_LOW = QUANTILE25 - (1.5 * (QUANTILE75 - QUANTILE25)), # outliers : below the outlier boxplot
          # OUTLIERS_HIGH = QUANTILE75 + (1.5 * (QUANTILE75 - QUANTILE25)) # outliers : higher the outlier boxplot
        ) %>%
        readr::write_tsv(
          x = .,
          file = file.path(path.folder, "individuals.pairwise.distance.stats.tsv"),
          col_names = TRUE
        )

      message("Generating plots")
      # violin plot
      res$violin.plot.distance <- ggplot2::ggplot(
        data = res$distance,
        ggplot2::aes(x = PAIRWISE, y = DISTANCE_RELATIVE, na.rm = TRUE)
      ) +
        ggplot2::geom_violin(trim = TRUE) +
        ggplot2::geom_boxplot(width = 0.1, fill = "black", outlier.colour = "black") +
        ggplot2::stat_summary(fun = "mean", geom = "point", shape = 21, size = 2.5, fill = "white") +
        ggplot2::labs(y = "Distance (relative)\n <- distant      close->") +
        ggplot2::labs(x = "Pairwise comparisons") +
        ggplot2::scale_y_reverse() +
        ggplot2::theme(
          # legend.position = "none",
          panel.grid.minor.x = ggplot2::element_blank(),
          panel.grid.major.x = ggplot2::element_blank(),
          # panel.grid.major.y = element_blank(),
          axis.title.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
          axis.text.x = ggplot2::element_blank(),
          axis.title.y = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
          axis.text.y = ggplot2::element_text(size = 8, family = "Helvetica")
        )
      ggplot2::ggsave(
        filename = file.path(path.folder, "violin.plot.distance.pdf"),
        plot = res$violin.plot.distance,
        width = 20,
        height = 15,
        dpi = 200,
        units = "cm",
        useDingbats = FALSE
      )
      ggplot2::ggsave(
        filename = file.path(path.folder, "violin.plot.distance.png"),
        plot = res$violin.plot.distance,
        width = 20,
        height = 15,
        dpi = 200,
        units = "cm"
      )

      # Manhattan plot
      res$manhattan.plot.distance <- ggplot2::ggplot(
        data = res$distance,
        ggplot2::aes(x = PAIRWISE, y = DISTANCE_RELATIVE, colour = POP_COMP)
      ) +
        ggplot2::geom_jitter(alpha = 0.3) +
        ggplot2::labs(y = "Distance (relative)\n <- distant      close->") +
        ggplot2::labs(x = "Pairwise comparisons") +
        ggplot2::labs(colour = "Population comparisons") +
        ggplot2::scale_colour_manual(values = c("#0571b0", "black")) +
        ggplot2::scale_y_reverse() +
        ggplot2::theme(
          # legend.position = "none",
          panel.grid.minor.x = ggplot2::element_blank(),
          panel.grid.major.x = ggplot2::element_blank(),
          # panel.grid.major.y = element_blank(),
          axis.title.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
          axis.text.x = ggplot2::element_blank(),
          axis.title.y = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
          axis.text.y = ggplot2::element_text(size = 8, family = "Helvetica")
        ) +
        ggplot2::theme_light()

      ggplot2::ggsave(
        filename = file.path(path.folder, "manhattan.plot.distance.pdf"),
        plot = res$manhattan.plot.distance,
        width = 20,
        height = 15,
        dpi = 200,
        units = "cm",
        useDingbats = FALSE
      )

      ggplot2::ggsave(
        filename = file.path(path.folder, "manhattan.plot.distance.png"),
        plot = res$manhattan.plot.distance,
        width = 20,
        height = 15,
        dpi = 200,
        units = "cm"
      )

    } # end distance method

    # Compute genome similarity -------------------------------------------------
    if (genome) {

      # If GT_BIN available, we need a new input.prep (not the same as dist method)
      if (gt.field == "GT_BIN") {
        input.prep <- dplyr::filter(.data = data, !is.na(GT_BIN))
      }

      # all combination of individual pair
      all.pairs <- utils::combn(unique(input.prep$INDIVIDUALS), 2, simplify = TRUE) %>%
        t %>%
        tibble::as_tibble(.) %>%
        magrittr::set_colnames(x = ., value = c("ID1", "ID2")) %>%
        dplyr::mutate(
          PAIRS = seq(from = 1, to = n(), by = 1),
          MARKERS_TOTAL = rep(n.markers, n()) # just n.markers works
        )

      # get the number of pairwise comp.
      number.pairwise <- nrow(all.pairs)
      message("Starting scan for duplicates")
      message("Pairwise comparisons: ", number.pairwise)
      if (!is.null(threshold.common.markers)) {
        message("Threshold to keep a pair: ", round(threshold.common.markers, 2))
      }
      keep.data <- TRUE
      if (number.pairwise > 100000) {
        if (verbose) message("    Time for coffee...")
        keep.data <- FALSE
      }

      # generate the file to print into
      pairwise.filename <- file.path(path.folder, "individuals.pairwise.genome.similarity.tsv")
      pairwise.genome.similarity <- tibble::tibble(
        PAIRS = as.numeric(),
        ID1 = as.character(),
        ID2 = as.character(),
        ID1_STRATA = as.character(),
        ID2_STRATA = as.character(),
        POP_COMP = as.character(),
        MARKERS_TOTAL = as.integer(),
        MARKERS_COMMON = as.integer(),
        MARKERS_COMMON_PROP = as.numeric(),
        MARKERS_MISSING = as.integer(),
        IDENTICAL = as.integer(),
        DIFFERENT = as.integer(),
        IDENTICAL_PROP = as.numeric(),
        IDENTICAL_PROP_TOTAL = as.numeric(),
        GENOTYPED_MIN = as.character(),
        GENOTYPED_MAX = as.character(),
        PAIRWISE = as.character(),
        METHOD = as.character()
      ) %>%
        readr::write_tsv(
          x = .,
          file = pairwise.filename,
          col_names = TRUE)

      blacklist.pairs.filename <- file.path(path.folder, "blacklist.pairs.threshold.tsv")
      blacklist.pairs <- tibble::tibble(
        PAIRS = as.numeric(),
        ID1 = as.character(),
        ID2 = as.character()) %>%
        readr::write_tsv(
          x = .,
          file =  blacklist.pairs.filename,
          col_names = TRUE)


      # number.pairwise <-  10
      # number.pairwise <-  1000
      # number.pairwise <-  2000
      # number.pairwise <-  5000
      # number.pairwise <-  20000
      # parallel.core <- 10

      # Optimizing cpu usage
      round.cpu <- 1
      if (number.pairwise > 200 * parallel.core) {
        round.cpu <- floor(number.pairwise / (200 * parallel.core))
      }

      res$pairwise.genome.similarity <- radiator_future(
        .x = all.pairs,
        .f = genome_similarity,
        flat.future = "dfr",
        split.vec = TRUE,
        split.with = NULL,
        split.chunks = parallel.core * round.cpu,
        parallel.core = parallel.core,
        data = input.prep,
        threshold.common.markers = threshold.common.markers,
        keep.data = keep.data,
        pairwise.filename = pairwise.filename,
        blacklist.pairs.filename = blacklist.pairs.filename
      )

      # as.integer is usually twice as light as numeric vector...
      # all.pairs$SPLIT_VEC <- as.integer(floor((parallel.core * round.cpu * (seq_len(number.pairwise) - 1) / number.pairwise) + 1))
      # res$pairwise.genome.similarity <- radiator_parallel(
      #   X = unique(all.pairs$SPLIT_VEC),
      #   FUN = genome_similarity,
      #   mc.preschedule = FALSE,
      #   mc.silent = FALSE,
      #   mc.cleanup = TRUE,
      #   mc.cores = parallel.core,
      #   all.pairs = all.pairs,
      #   data = input.prep,
      #   threshold.common.markers = threshold.common.markers,
      #   keep.data = keep.data,
      #   pairwise.filename = pairwise.filename,
      #   blacklist.pairs.filename = blacklist.pairs.filename
      # ) %>%
      #   dplyr::bind_rows(.)
      input.prep <- id.pairwise <- NULL # no longer needed

      if (verbose) message("Generating summary statistics")
      if (nrow(res$pairwise.genome.similarity) == 0) {
        data.import <- readr::read_tsv(file = pairwise.filename, col_types = "d____c___i__d___c_")
        new.pairwise <- nrow(data.import)
        res$genome.stats <- data.import %>%
          dplyr::summarise(
            MEAN = mean(IDENTICAL_PROP, na.rm = TRUE),
            MEDIAN = stats::median(IDENTICAL_PROP, na.rm = TRUE),
            SE = round(sqrt(stats::var(IDENTICAL_PROP, na.rm = TRUE)/length(stats::na.omit(IDENTICAL_PROP))), 2),
            MIN = round(min(IDENTICAL_PROP, na.rm = TRUE), 2),
            MAX = round(max(IDENTICAL_PROP, na.rm = TRUE), 2),
            QUANTILE25 = stats::quantile(IDENTICAL_PROP, 0.25), # quantile25
            QUANTILE75 = stats::quantile(IDENTICAL_PROP, 0.75)# quantile75
          )
      } else {
        new.pairwise <- nrow(res$pairwise.genome.similarity)
        # Stats
        res$genome.stats <- res$pairwise.genome.similarity%>%
          dplyr::summarise(
            MEAN = mean(IDENTICAL_PROP, na.rm = TRUE),
            MEDIAN = stats::median(IDENTICAL_PROP, na.rm = TRUE),
            SE = round(sqrt(stats::var(IDENTICAL_PROP, na.rm = TRUE)/length(stats::na.omit(IDENTICAL_PROP))), 2),
            MIN = round(min(IDENTICAL_PROP, na.rm = TRUE), 2),
            MAX = round(max(IDENTICAL_PROP, na.rm = TRUE), 2),
            QUANTILE25 = stats::quantile(IDENTICAL_PROP, 0.25), # quantile25
            QUANTILE75 = stats::quantile(IDENTICAL_PROP, 0.75)# quantile75
          )
      }
      discarded.pairs <- number.pairwise - new.pairwise
      if (discarded.pairs > 0) {
        message("Number of discarded pairs based on threshold: ", round(discarded.pairs, 2))
        message("    more details in: blacklist.pairs.threshold.tsv\n")
      } else {
        file.remove(blacklist.pairs.filename)
      }

      readr::write_tsv(
        x = res$genome.stats,
        file = file.path(path.folder, "individuals.pairwise.genome.stats.tsv"),
        col_names = TRUE
      )

      # Visualization ------------------------------------------------------------
      message("Generating the plots")

      if (nrow(res$pairwise.genome.similarity) == 0) {
        res$violin.plot.genome <- ggplot2::ggplot(
          data = data.import,
          ggplot2::aes(x = PAIRWISE, y = IDENTICAL_PROP, na.rm = TRUE))
        res$manhattan.plot.genome <- ggplot2::ggplot(
          data = data.import,
          ggplot2::aes(x = PAIRWISE, y = IDENTICAL_PROP, colour = POP_COMP,
                       size = MARKERS_MISSING))
      } else {
        res$violin.plot.genome <- ggplot2::ggplot(
          data = res$pairwise.genome.similarity,
          ggplot2::aes(x = PAIRWISE, y = IDENTICAL_PROP, na.rm = TRUE))

        res$manhattan.plot.genome <- ggplot2::ggplot(
          data = res$pairwise.genome.similarity,
          ggplot2::aes(x = PAIRWISE, y = IDENTICAL_PROP, colour = POP_COMP,
                       size = MARKERS_MISSING))
      }

      # violin plot
      res$violin.plot.genome <- res$violin.plot.genome +
        ggplot2::geom_violin(trim = TRUE) +
        ggplot2::geom_boxplot(width = 0.1, fill = "black", outlier.colour = "black") +
        ggplot2::stat_summary(fun = "mean", geom = "point", shape = 21, size = 2.5, fill = "white") +
        ggplot2::labs(y = "Genome similarity (proportion)") +
        ggplot2::labs(x = "Pairwise comparison") +
        ggplot2::theme(
          # legend.position = "none",
          panel.grid.minor.x = ggplot2::element_blank(),
          panel.grid.major.x = ggplot2::element_blank(),
          # panel.grid.major.y = element_blank(),
          axis.title.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
          axis.text.x = ggplot2::element_blank(),
          axis.title.y = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
          axis.text.y = ggplot2::element_text(size = 8, family = "Helvetica")
        )
      ggplot2::ggsave(
        filename = file.path(path.folder, "violin.plot.genome.pdf"),
        plot = res$violin.plot.genome,
        width = 20, height = 15, dpi = 150, units = "cm", useDingbats = FALSE)

      # Manhattan plot
      res$manhattan.plot.genome <- res$manhattan.plot.genome +
        ggplot2::geom_jitter(alpha = 0.3) +
        ggplot2::labs(y = "Genome similarity (proportion)") +
        ggplot2::labs(x = "Pairwise comparisons") +
        ggplot2::labs(colour = "Population comparisons") +
        ggplot2::scale_colour_manual(values = c("#0571b0", "black")) +
        ggplot2::scale_size_area(name = "Markers missing", max_size = 6) +
        ggplot2::theme(
          # legend.position = "none",
          panel.grid.minor.x = ggplot2::element_blank(),
          panel.grid.major.x = ggplot2::element_blank(),
          # panel.grid.major.y = element_blank(),
          axis.title.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
          axis.text.x = ggplot2::element_blank(),
          axis.title.y = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
          axis.text.y = ggplot2::element_text(size = 8, family = "Helvetica")
        ) +
        ggplot2::theme_light()

      ggplot2::ggsave(
        filename = file.path(path.folder, "manhattan.plot.genome.pdf"),
        plot = res$manhattan.plot.genome,
        width = 20, height = 15, dpi = 150, units = "cm", useDingbats = FALSE)

      ggplot2::ggsave(
        filename = file.path(path.folder, "manhattan.plot.genome.png"),
        plot = res$manhattan.plot.genome,
        width = 20, height = 15, dpi = 200, units = "cm")



    } # end genome method

    # Removing duplicates ------------------------------------------------------
    if (blacklist.duplicates || interactive.filter) {
      check.mono <- FALSE

      message("\nInspect tables and figures to decide if some individual(s) need to be blacklisted")
      remove.id <- radiator_question(
        x = "    Do you need to blacklist individual(s) (y/n): ", answer.opt = c("y", "n"))

      if (remove.id == "y") {
        message("\n2 options to remove duplicates:")
        message("    1. threshold: using the figure you choose a threshold. It's more powerful to fully remove duplicates")
        message("    2. manually: the function generate a blacklist that you have to complete")
        message("    Note: not sure ? Use option 1, it's more powerful to fully remove duplicates")
        remove.dup <- radiator_question(
          x = "    Enter the option to remove duplicates (1/2): ", minmax = c(1,2))

        if (remove.dup == 2) {
          readr::write_tsv(
            x = tibble::tibble(INDIVIDUALS = as.character()),
            file = file.path(path.folder, "blacklist.id.similar.tsv"),
            append = FALSE, col_names = TRUE)
          message("    An empty blacklist file was generated: blacklist.id.similar.tsv")
          message("    Keep column name, just add the individual(s) to blacklist(s)")
          blacklist.id.similar <- "check blacklist.id.similar.tsv file"

          finished <- radiator_question(
            x = "    When finished filling the blacklist type `y`:",
            answer.opt = c("y", "Y", "yes", "Yes", "YES"))

          blacklist.id.similar <- readr::read_tsv(
            file = file.path(path.folder, "blacklist.id.similar.tsv"),
            col_types = "c")
          dup.threshold <- "blacklist generated manually"
        } else {# with threshold
          # message("Use the distance or genome analysis to blacklist duplicates ? (distance/genome): ")
          # analysis <- as.character(readLines(n = 1))

          if (data.type == "SeqVarGDSClass") {
            analysis <- "distance"
          } else {
            if (!is.null(distance.method) && genome) {
              analysis <- radiator_question(
                x = "\nChoose the analysis method to blacklist duplicates? (distance/genome): ",
                answer.opt = c("distance", "genome")
              )
            } else if (genome) {
              analysis <- "genome"
            } else {
              analysis <- "distance"
            }
          }

          if (analysis == "distance") {
            data.dup <-  "individuals.pairwise.dist.tsv"
          } else {
            data.dup <-  "individuals.pairwise.genome.similarity.tsv"
          }
          dup.threshold <- radiator_question(
            x = "\nEnter the threshold to remove duplicates: (between 0 and 1)", minmax = c(0, 1))

          message("\n2 options to remove duplicates involved in pairs from different strata/group:")
          message("    (the black points on the figure, above your threshold)")
          message("    1: blacklist both samples in the pair")
          message("    2: blacklist only 1 sample, based on missingness")
          diff.pop.remove <- radiator_question(
            x = "    Enter 1/2: ", minmax = c(1, 2))


          if (diff.pop.remove == 2) {
            diff.pop.remove <- FALSE
          } else {
            diff.pop.remove <- TRUE
          }
          blacklist.id.similar <- remove_duplicates(
            data = data.dup,
            stats = "genotyped.statistics.tsv",
            dup.threshold = dup.threshold,
            diff.pop.remove = diff.pop.remove,
            path.folder = path.folder) %$% blacklist.id
        }
        n.ind.blacklisted <- nrow(blacklist.id.similar)
      } else {
        dup.threshold <- 0L
        n.ind.blacklisted <- 0L
      }

      if (n.ind.blacklisted > 0) {
        check.mono <- TRUE
        if (verbose) message("Blacklisted individuals: ", n.ind.blacklisted, " ind.")
        if (verbose) message("    Filtering with blacklist of individuals")

        if (data.type == "tbl_df") {
          data  %<>% dplyr::filter(!INDIVIDUALS %in% blacklist.id.similar$INDIVIDUALS)
        } else {
          id.info <- extract_individuals_metadata(gds = data, whitelist = FALSE) %>%
            dplyr::mutate(
              FILTERS = dplyr::if_else(
                INDIVIDUALS %in% blacklist.id.similar$INDIVIDUALS,
                "filter.individuals.duplicates", FILTERS
              )
            )
          update_radiator_gds(gds = data, node.name = "individuals.meta", value = id.info, sync = TRUE)

          # id.info <- extract_individuals_metadata(gds = data)
          # bl <- id.info %>% dplyr::filter(INDIVIDUALS %in% blacklist.id.similar$INDIVIDUALS) %>%
          #   dplyr::distinct(INDIVIDUALS) %>%
          #   dplyr::mutate(FILTER = "filter.individuals.duplicates")
          # bl.i <- update_bl_individuals(gds = data, update = bl)

          # id.info %<>% dplyr::filter(!INDIVIDUALS %in% blacklist.id.similar$INDIVIDUALS)
        }
        blacklist.id.similar <- NULL
      }
    } else {
      dup.threshold <- 0L
      n.ind.blacklisted <- 0L
      check.mono <- FALSE
    }# End blacklist.duplicates

    # updating parameters --------------------------------------------------
    filters.parameters <- radiator_parameters(
      generate = FALSE,
      initiate = FALSE,
      update = TRUE,
      parameter.obj = filters.parameters,
      data = data,
      filter.name = "detect duplicate genomes",
      param.name = "dup.threshold",
      values = dup.threshold,
      path.folder = path.folder,
      file.date = file.date,
      internal = internal,
      verbose = verbose)

    # results ------------------------------------------------------------------
    radiator_results_message(
      rad.message = stringi::stri_join("Detect duplicate genomes: ", dup.threshold),
      filters.parameters,
      internal,
      verbose
    )

    # MONOMORPHIC MARKERS --------------------------------------------------
    if (check.mono) {
      data <- filter_monomorphic(
        data = data,
        parallel.core = parallel.core,
        verbose = FALSE,
        parameters = filters.parameters,
        path.folder = path.folder,
        internal = FALSE)
    }
  }
  return(data)
}# end function detect_duplicate_genomes

# Internal nested functions: ---------------------------------------------------

# distance method --------------------------------------------------------------
#' @title Distance individuals
#' @description distance method
#' @rdname distance_individuals
#' @export
#' @keywords internal
distance_individuals <- function(
  x,
  strata = NULL,
  distance.method = "manhattan",
  parallel.core = parallel::detectCores() - 1
) {

  data.type <- radiator::detect_genomic_format(x)

  if (data.type == "tbl_df") {
    x <- dplyr::select(x, -MARKERS)
    n.ind <- dplyr::n_distinct(x$INDIVIDUALS)
    x <- suppressWarnings(
      dplyr::ungroup(x) %>%
        radiator::rad_wide(
          x = .,
          formula = "INDIVIDUALS ~ MARKERS_ALLELES",
          values_from = "n"
        ) %>%
        tibble::remove_rownames(.data = .) %>%
        tibble::column_to_rownames(.data = ., var = "INDIVIDUALS"))

    x <- suppressWarnings(
      amap::Dist(
        x = x,
        method = distance.method,
        nbproc = parallel.core)) %>%
      distance2tibble(.)# melt the dist matrice into a data frame
  } else {
    sample.id <- extract_individuals_metadata(
      gds = x,
      ind.field.select = "INDIVIDUALS",
      whitelist = TRUE) %$%
      INDIVIDUALS
    # x.bk <- x
    x <- SNPRelate::snpgdsIBS(
      gdsobj = x,
      autosome.only = FALSE,
      num.thread = parallel.core,
      remove.monosnp = TRUE,
      snp.id = extract_markers_metadata(
        gds = x, markers.meta.select = "VARIANT_ID", whitelist = TRUE) %$% VARIANT_ID,
      sample.id = sample.id,
      verbose = FALSE) %$% ibs

    # summary_gds(gds = x.bk)

    x <- 1 - x
    # magrittr::subtract(1) %>%
    x  %<>%
      magrittr::set_colnames(sample.id) %>%
      magrittr::set_rownames(sample.id) %>%
      distance2tibble(.)
    sample.id <- NULL
    distance.method <- "IBS"
  }


  # Include population info with strata
  ID1.pop <- suppressWarnings(
    dplyr::select(.data = x, INDIVIDUALS = ID1) %>%
      dplyr::inner_join(strata, by = "INDIVIDUALS") %>%
      dplyr::select(ID1_POP = STRATA))

  ID2.pop <- suppressWarnings(
    dplyr::select(.data = x, INDIVIDUALS = ID2) %>%
      dplyr::inner_join(strata, by = "INDIVIDUALS") %>%
      dplyr::select(ID2_POP = STRATA))

  x <- dplyr::bind_cols(x, ID1.pop, ID2.pop) %>%
    dplyr::mutate(
      POP_COMP = dplyr::if_else(ID1_POP == ID2_POP, "same pop", "different pop"),
      POP_COMP = factor(POP_COMP, levels = c("same pop", "different pop"), ordered = TRUE),
      PAIRWISE = rep("pairwise", n()),
      METHOD = rep(distance.method, n())
    )
  ID1.pop <- ID2.pop <- NULL
  return(x)
}#End distance_individuals


# pairwise genome similarity method---------------------------------------------
#' @title genome_similarity
#' @description for the parallel part
#' @rdname genome_similarity
#' @export
#' @keywords internal
genome_similarity <- carrier::crate(function(
  all.pairs,
  data = NULL,
  threshold.common.markers = NULL,
  keep.data = TRUE,
  pairwise.filename = NULL,
  blacklist.pairs.filename = NULL
) {
  `%>%` <- magrittr::`%>%`
  `%<>%` <- magrittr::`%<>%`

  # split.vec <- 20 # test
  # all.pairs %<>%
  #   dplyr::filter(SPLIT_VEC == split.vec) %>%
  #   dplyr::select(-SPLIT_VEC)

  genome_similarity_map <- function(
    pair,
    some.pairs = NULL,
    data = NULL,
    threshold.common.markers = NULL
  ) {
    `%>%` <- magrittr::`%>%`
    `%<>%` <- magrittr::`%<>%`

    # pair <- 1#test
    res <- list()
    res$genome.comparison <- dplyr::filter(some.pairs, PAIRS == pair) %>%
      dplyr::mutate(
        ID1_STRATA = unique(data$STRATA[data$INDIVIDUALS == ID1]),
        ID2_STRATA = unique(data$STRATA[data$INDIVIDUALS == ID2])
      )

    if (rlang::has_name(data, "GT_BIN")) {
      # genotypes & markers
      id1.data <- dplyr::filter(.data = data, INDIVIDUALS %in% res$genome.comparison$ID1) %>%
        dplyr::select(-INDIVIDUALS, -STRATA) %>% dplyr::arrange(MARKERS)
      id2.data <- dplyr::filter(.data = data, INDIVIDUALS %in% res$genome.comparison$ID2) %>%
        dplyr::select(-INDIVIDUALS, -STRATA) %>% dplyr::arrange(MARKERS)
      id1.n.geno <- nrow(id1.data)
      id2.n.geno <- nrow(id2.data)
      n.markers <- dplyr::n_distinct(data$MARKERS)
      data <- NULL

      markers.common <- length(dplyr::intersect(x = id1.data$MARKERS, y = id2.data$MARKERS))
      common.prop <- markers.common/n.markers
      keep.comp <- TRUE
      # threshold.common.markers <- 0.3#test
      if (!is.null(threshold.common.markers)) {
        keep.comp <- common.prop >= threshold.common.markers
      }

      if (keep.comp) {
        res$genome.comparison <-  res$genome.comparison %>%
          dplyr::mutate(
            POP_COMP = dplyr::if_else(ID1_STRATA == ID2_STRATA, "same pop", "different pop"),
            MARKERS_TOTAL = n.markers,
            MARKERS_COMMON = markers.common,
            MARKERS_COMMON_PROP = common.prop,
            MARKERS_MISSING = MARKERS_TOTAL - MARKERS_COMMON,
            IDENTICAL = nrow(dplyr::intersect(x = id1.data, y = id2.data)),
            DIFFERENT = MARKERS_COMMON - IDENTICAL,
            IDENTICAL_PROP = IDENTICAL / MARKERS_COMMON,
            IDENTICAL_PROP_TOTAL = IDENTICAL / MARKERS_TOTAL,
            GENOTYPED_MIN = dplyr::if_else(id1.n.geno < id2.n.geno, ID1, ID2),
            GENOTYPED_MAX = dplyr::if_else(id1.n.geno > id2.n.geno, ID1, ID2),
            PAIRWISE = "pairwise comparison",
            METHOD = "genome similarity") %>%
          dplyr::select(PAIRS, ID1, ID2, ID1_STRATA, ID2_STRATA,
                        POP_COMP, MARKERS_TOTAL, MARKERS_COMMON,
                        MARKERS_COMMON_PROP, MARKERS_MISSING, IDENTICAL,
                        DIFFERENT, IDENTICAL_PROP, IDENTICAL_PROP_TOTAL, GENOTYPED_MIN,
                        GENOTYPED_MAX, PAIRWISE, METHOD)
        res$blacklist.pairs.threshold <- NULL
        # res$blacklist.pairs.threshold <- tibble::tibble(
        #   PAIRS = as.numeric(),
        #   ID1 = as.character(),
        #   ID2 = as.character())
      } else {
        message("blacklisted pairs")
        res$blacklist.pairs.threshold <- dplyr::select(res$genome.comparison, PAIRS, ID1, ID2)
        res$genome.comparison <- NULL
      }

    } else {#using GT
      data %<>% dplyr::filter(INDIVIDUALS %in% list.pair & n != 0)

      # genotypes & markers
      id1.data <- dplyr::filter(.data = data, INDIVIDUALS %in% id1) %>%
        dplyr::select(-INDIVIDUALS)

      id2.data <- dplyr::filter(.data = data, INDIVIDUALS %in% id2) %>%
        dplyr::select(-INDIVIDUALS)

      # markers
      id1.markers <- dplyr::distinct(id1.data, MARKERS)
      id2.markers <- dplyr::distinct(id2.data, MARKERS)

      # Check the number of markers in common
      # number.common.markers <- nrow(dplyr::intersect(x = id1.markers, y = id2.markers))

      # if (is.null(common.markers.threshold)) common.markers.threshold <- number.common.markers

      # output comparison
      genome.comparison <- tibble::tibble(
        ID1 = id1,
        ID2 = id2,
        MARKERS_COMMON = nrow(dplyr::intersect(x = id1.markers, y = id2.markers)),
        IDENTICAL = nrow(dplyr::intersect(x = id1.data, y = id2.data) %>% dplyr::distinct(MARKERS)),
        DIFFERENT = MARKERS_COMMON - IDENTICAL,
        PROP_IDENTICAL = IDENTICAL / MARKERS_COMMON
      )
    }
    #unused objets:
    list.pair <- id1 <- id2 <- data <- id1.data <- id2.data <- NULL
    id1.n.geno <- id2.n.geno <- keep.comp <- markers.common <- common.prop <- NULL

    return(res)
  }#End genome_similarity_map

  genome.comparison <- purrr::map(
    .x = all.pairs$PAIRS,
    .f = genome_similarity_map,
    some.pairs = all.pairs,
    data = data,
    threshold.common.markers = threshold.common.markers)
  all.pairs <- NULL

  blacklist.pairs.threshold <- genome.comparison %>%
    purrr::map_df("blacklist.pairs.threshold")

  if (nrow(blacklist.pairs.threshold) > 0) {
    readr::write_tsv(
      x = blacklist.pairs.threshold,
      file = blacklist.pairs.filename,
      append = TRUE, col_names = FALSE)
  }

  genome.comparison %<>%
    purrr::map_df("genome.comparison") %>%
    readr::write_tsv( x = ., file = pairwise.filename, append = TRUE, col_names = FALSE)

  if (!keep.data) genome.comparison <- NULL
  blacklist.pairs.threshold <- NULL
  return(genome.comparison)
})#End genome_similarity

# calculate allele count in parallel -------------------------------------------
#' @title allele_count
#' @description to calculate allele count in parallel
#' @rdname allele_count
#' @export
#' @keywords internal
allele_count <- carrier::crate(function(x) {
  `%>%` <- magrittr::`%>%`
  `%<>%` <- magrittr::`%<>%`
  x %<>%
    dplyr::ungroup(.) %>%
    dplyr::select(MARKERS, INDIVIDUALS, GT) %>%
    dplyr::filter(GT != "000000") %>%
    dplyr::mutate(
      A1 = stringi::stri_sub(str = GT, from = 1, to = 3),
      A2 = stringi::stri_sub(str = GT, from = 4, to = 6)
    ) %>%
    dplyr::select(-GT) %>%
    radiator::rad_long(
      x = .,
      cols = c("INDIVIDUALS", "MARKERS"),
      names_to = "ALLELES",
      values_to = "GT"
    ) %>%
    dplyr::arrange(MARKERS, INDIVIDUALS, GT) %>%
    dplyr::count(x = ., INDIVIDUALS, MARKERS, GT) %>%
    dplyr::ungroup(.) %>%
    tidyr::complete(
      data = ., INDIVIDUALS, tidyr::nesting(MARKERS, GT), fill = list(n = 0))
  return(x)
})#End allele_count


# remove_duplicates  -----------------------------------------------------------

#' @name remove_duplicates
#' @title Read tidy genomic data file ending .rad
#' @description Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' To remove duplicate individuals based on threshold established from the
#' visualization figures.

#' @param data (path) The individual's pairwise data.
#' Default: \code{data = "individuals.pairwise.dist.tsv"}.
#' @param stats (path) The genotype statistics
#' Default: \code{stats = "genotyped.statistics.tsv"}.
#' @param dup.threshold (double) The threshold to filter out duplicates
#' Default: \code{dup.threshold = 0.25}.
#' @param diff.pop.remove Remove all individuals in pairs from different pop.
#' Both samples are potentially problems. With defautl, the function will not keep
#' one sample in the duplicate pair.
#' Default: \code{diff.pop.remove = TRUE}.

#' @return A list with blacklisted duplicates. Write the blacklist in the working
#' directory.
#' @export
#' @keywords internal
#' @rdname remove_duplicates
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}

remove_duplicates <- function(
  data = "individuals.pairwise.dist.tsv",
  stats = "genotyped.statistics.tsv",
  dup.threshold = 0.25,
  diff.pop.remove = TRUE,
  path.folder = NULL
) {
  if (is.null(path.folder)) path.folder <- getwd()
  data <-  file.path(path.folder, data)
  stats <-  file.path(path.folder, stats)

  dup.filtered <- suppressWarnings(suppressMessages(readr::read_tsv(data))) %>%
    dplyr::mutate(ID1 = as.character(ID1), ID2 = as.character(ID2))

  if (rlang::has_name(dup.filtered, "DISTANCE_RELATIVE")) {
    dup.filtered <- dup.filtered %>%
      dplyr::filter(DISTANCE_RELATIVE < dup.threshold)
  } else {
    dup.filtered <- dup.filtered %>%
      dplyr::filter(PROP_IDENTICAL > dup.threshold)
  }


  if (nrow(dup.filtered) > 0) {
    dup.list.names <- tibble::tibble(INDIVIDUALS = c(dup.filtered$ID1, dup.filtered$ID2)) %>%
      dplyr::group_by(INDIVIDUALS) %>%
      dplyr::tally(.) %>%
      dplyr::ungroup(.) %>%
      dplyr::arrange(dplyr::desc(n)) %>%
      dplyr::distinct(INDIVIDUALS) %>%
      purrr::flatten_chr(.)

    geno.stats <- readr::read_tsv(stats, col_types = "cd") %>%
      dplyr::filter(INDIVIDUALS %in% dup.list.names)

    res <- list(blacklist.id = tibble::tibble(INDIVIDUALS = character(0)),
                whitelist.id = tibble::tibble(INDIVIDUALS = character(0)))

    for (i in dup.list.names) {
      # i <- dup.list.names[1]
      dups <- dplyr::filter(dup.filtered, ID1 %in% i | ID2 %in% i)
      dups <- sort(unique(c(dups$ID1, dups$ID2)))

      # find all duplicates associated with the network
      new.dups <- 0L
      while(length(new.dups) > 0) {
        new.dups <- dplyr::filter(dup.filtered, ID1 %in% dups | ID2 %in% dups)
        new.dups <- sort(unique(c(new.dups$ID1, new.dups$ID2)))
        new.dups <- purrr::keep(.x = new.dups, .p = !new.dups %in% dups)
        if (length(new.dups) > 0) {
          dups <- c(dups, new.dups)
        }
      }
      dups <- tibble::tibble(INDIVIDUALS = dups)

      if (nrow(res$blacklist.id) > 0) {
        dups <- dplyr::filter(dups, !INDIVIDUALS %in% res$blacklist.id$INDIVIDUALS)
      }

      if (nrow(dups) > 0) {

        diff.pop <- dup.filtered %>%
          dplyr::filter(ID1 %in% dups$INDIVIDUALS | ID2 %in% dups$INDIVIDUALS) %>%
          dplyr::distinct(POP_COMP) %>%
          dplyr::filter(POP_COMP == "different pop")

        if (diff.pop.remove && (nrow(diff.pop) > 0)) {
          blacklist.diff.pop <- dup.filtered %>%
            dplyr::filter(ID1 %in% dups$INDIVIDUALS | ID2 %in% dups$INDIVIDUALS) %>%
            dplyr::distinct(POP_COMP) %>%
            dplyr::filter(POP_COMP == "different pop")

          if (nrow(blacklist.diff.pop) > 0) {
            res$blacklist.id <- dplyr::bind_rows(res$blacklist.id, dups)
          }
          blacklist.diff.pop <- NULL
        } else {
          whitelist.id <- dups %>%
            dplyr::left_join(geno.stats, by = "INDIVIDUALS") %>%
            dplyr::filter(GENOTYPED_PROP == max(GENOTYPED_PROP)) %>%
            dplyr::sample_n(tbl = ., size = 1) %>% # make sure only 1 is selected
            dplyr::select(INDIVIDUALS)

          if (nrow(whitelist.id) > 0) res$whitelist.id <- dplyr::bind_rows(res$whitelist.id, whitelist.id)

          blacklist.id <- dplyr::filter(dups, !INDIVIDUALS %in% whitelist.id$INDIVIDUALS) %>%
            dplyr::select(INDIVIDUALS)

          if (nrow(blacklist.id) > 0) res$blacklist.id <- dplyr::bind_rows(res$blacklist.id, blacklist.id)
        }
        diff.pop <- NULL
      }
    }
    dups <- blacklist.id <- whitelist.id <- i <- new.dups <- NULL

    res$blacklist.id <- dplyr::distinct(res$blacklist.id, INDIVIDUALS)
    res$whitelist.id <- dplyr::distinct(res$whitelist.id, INDIVIDUALS)
    message("With threshold selected, ", nrow(res$blacklist.id) ," individual(s) blacklisted")
    readr::write_tsv(x = res$blacklist.id, file = file.path(path.folder, "blacklist.id.similar.tsv"))
    message("Written in the directory: blacklist.id.similar.tsv")

  } else {
    message("With threshold selected, the blacklist of duplicate individuals is empty")
    res <- list(blacklist.id = tibble::tibble(INDIVIDUALS = character(0)),
                whitelist.id = tibble::tibble(INDIVIDUALS = character(0)))
  }
  return(res)
} # End remove_duplicates
thierrygosselin/radiator documentation built on May 5, 2024, 5:12 a.m.