R/dart.R

Defines functions read_dart

Documented in read_dart

# This file is for everything related to DArT
# There is one for VCF and one for GDS

# read_dart -------------------------------------------------------------------------
# Import, filter and transform a dart output file to different formats

#' @name read_dart

#' @title Read and tidy \href{http://www.diversityarrays.com}{DArT} output files.

#' @description Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and might be of interest for users. The function generate a GDS object/file
#' and optionally, a tidy dataset using
#' \href{http://www.diversityarrays.com}{DArT} files. Read the \emph{details} section
#' to understand why it's better than dartR.

#' @param data (file) 6 files formats used by DArT are recognized by radiator.
#' Don't modify the DArT file, to do this, use the \code{strata} file/argument below.
#' The function can import files ending with \code{.csv} or \code{.tsv}.
#'
#' \enumerate{
#' \item \strong{1row}: Genotypes are in 1 row and coded (0, 1, 2, -).
#' \code{0 for 2 reference alleles REF/REF}, \code{1 for 2 alternate alleles ALT/ALT},
#' \code{2 for heterozygote REF/ALT}, \code{- for missing}.
#' \item \strong{2rows}: No genotypes. It's absence/presence, 0/1, of the REF and ALT alleles.
#' Sometimes called binary format.
#' \item \strong{counts}: No genotypes, It's counts/read depth for the REF and ALT alleles.
#' Sometimes just called count data. This should be the preferred file format,
#' because DArT output the coverage (read depth for each genotypes).
#' \item \strong{silico.dart}: SilicoDArT data. No genotypes, no REF or ALT alleles.
#' It's a file coded as absence/presence, 0/1, for the presence of sequence in
#' the clone id.
#' \item \strong{silico.dart.counts}: SilicoDArT data. No genotypes, no REF or ALT alleles.
#' It's a file coded as absence/presence, with counts for the presence of sequence in
#' the clone id.
#' \item \strong{dart.vcf}: For DArT VCFs, please use \code{\link{read_vcf}}.
#' }
#'
#' If you encounter a problem, sent me your data so that I can update
#' the function.

#' @param strata A tab delimited file or object with a minimum of 3 columns headers:
#' \enumerate{
#' \item \strong{TARGET_ID}: generated by DArT, it's made of integers.
#' For \code{1row} and \code{2rows} the \code{TARGET_ID} is usually the sample
#' name submitted to DArT.
#'
#'
#' If this is new to you, take a look at this function:
#' \code{\link{extract_dart_target_id}}, it's easier than opening your DArT file
#' in \code{MS EXCEL}. This will extract the TARGET_ID of your DArT file
#' you can then use it inside \code{MS EXCEL} and build the remaining columns.
#'
#'
#' \item \strong{INDIVIDUALS}: This is the time and place no name your sample correctly.
#'
#'
#' \item \strong{STRATA}: refers to any grouping of individuals. Usually,
#' your sampling sites or populations. Keep it simple, 3 or 4 letters, like
#' TAS for Tasmania, etc.
#'
#'
#' Silico DArT data is currently used to detect sex markers, so the \code{STRATA}
#' column should be filed with sex information: \code{M} or \code{F}.
#' }
#'
#'
#' @param tidy.dart (logical, optional) Generate a tidy dataset.
#' Default:\code{tidy.dart = FALSE}.

#' @param calibrate.alleles (optional, logical)
#' Default: \code{calibrate.alleles = TRUE}.
#' Documented in \code{\link[radiator]{calibrate_alleles}}.

#' @inheritParams tidy_genomic_data
#' @inheritParams radiator_common_arguments
#' @inheritParams filter_whitelist



#' @param ... (optional) To pass further argument for fine-tuning the function.

#' @return A radiator GDS file and tidy dataframe with several columns depending on DArT file:
#' \code{silico.dart:} A tibble with 5 columns: \code{CLONE_ID, SEQUENCE, VALUE, INDIVIDUALS, STRATA}.
#' This object is also saved in the directory (file ending with .rad).
#'
#' Common to \code{1row, 2rows and counts}: A GDS file is automatically generated.
#' To have a tidy tibble, the argument \code{tidy.dart = TRUE} must be used.
#'
#' \enumerate{
#' \item VARIANT_ID: generated by radiator and correspond the markers in integer.
#' \item MARKERS: generated by radiator and correspond to CHROM + LOCUS + POS separated by 2 underscores.
#' \item CHROM: the chromosome info, for de novo: CHROM_1.
#' \item LOCUS: the locus info.
#' \item POS: the SNP id on the LOCUS.
#' \item COL: the position of the SNP on the short read.
#' \item REF: the reference allele.
#' \item ALT: the alternate allele.
#' \item INDIVIDUALS: the sample name.
#' \item STRATA/POP_ID: populations id of the sample.
#' \item GT_BIN: the genotype based on the number of alternate allele in the genotype
#' (the count/dosage of the alternate allele). \code{0, 1, 2, NA}.
#' \item REP_AVG: the reproducibility average, output specific of DArT.
#' }
#' Other columns potentially in the tidy tibble:
#' \enumerate{
#' \item GT: the genotype in 6 digit format \emph{à la genepop}.
#' \item GT_VCF: the genotype in VCF format \code{0/0, 0/1, 1/1, ./.}.
#' \item GT_VCF_NUC: the genotype in VCF format, but keeping the nucleotide information.
#' \code{A/A, A/T, T/T, ./.}
#' \item AVG_COUNT_REF: the coverage for the reference allele, output specific of DArT.
#' \item AVG_COUNT_SNP: the coverage for the alternate allele, output specific of DArT.
#' \item READ_DEPTH: the number of reads used for the genotype (count data).
#' \item ALLELE_REF_DEPTH: the number of reads of the reference allele (count data).
#' \item ALLELE_ALT_DEPTH: the number of reads of the alternate allele (count data).
#' }
#'
#'
#' Written in the working directory:
#' \itemize{
#' \item The radiator GDS file
#' \item The DArT metadata information
#' \item The tidy DArT data
#' \item The strata file associated with this tidy dataset
#' \item The allele dictionary is a tibble with columns:
#' \code{MARKERS, CHROM, LOCUS, POS, REF, ALT}.
#' }

#' @details
#' More details on what is happening under the hood when you import the DArT file
#' in R:
#' \enumerate{
#' \item The DArT file is imported
#' \itemize{
#' \item DArT files should not be modify.
#' \item A lot of imports problems originates from files modifications,
#' a couple of common checks are done.
#' \item The format (1row, 2rows, counts, silico) is interpreted.
#' \item The number of target ids is checked.
#' }
#' \item The strata file file is imported:
#' \itemize{
#' \item This is the file that needs modifications.
#' \item This is here that you change the bad samples names.
#' \item Remove target ids (blacklist samples that you no longer want).
#' \item Change the order of the populations/sampling sites.
#' \item or alternatively, you can use the \code{pop.levels} argument
#' (see \emph{dots-dots-dots ...} in the advance mode section below)
#' }
#' \item The target ids between the DArT and strata files are verified and the files
#' are merged.
#' \item The data is inspected for duplicated names
#' \item DArT changed colnames in their files along the years, we tidy things:
#' \itemize{
#' \item colnames in camelcase are changed to snakecase
#' \item ALLELE_SEQUENCE is changed to SEQUENCE
#' \item TRIMMED_SEQUENCE is changed to SEQUENCE
#' \item CLUSTER_CONSENSUS_SEQUENCE is changed to SEQUENCE
#' \item Genomic metadata are named and or re-named based on the Variant Call Format Specification:CHROM, LOCUS, POS, COL, REF, ALT
#' }
#' \item With this function you have the option to tidy the DArT file:
#' \itemize{
#' \item What's that ?
#' \href{https://r4ds.hadley.nz/data-tidy.html}{R for Data Science: explanation}
#' \item It takes longer and you need more memory, but if you can allow it,
#' it's better for inspection and visualisation.
#' \item or you wait to filter the data and generate a tidy dataset with
#' \code{\link[radiator]{tidy_genomic_data}}
#' }
#' \item REF and ALT alleles are re-calibrated with \code{\link[radiator]{calibrate_alleles}}:
#' \itemize{
#' \item This is not optional
#' \item It takes longer than just reading the file like other software and packages, but
#' it's better.
#' }
#' }


#' @section Advance mode:
#'
#' \emph{dots-dots-dots ...} allows to pass several arguments for fine-tuning the function:
#' \enumerate{
#' \item \code{whitelist.markers}: detailed in \code{\link[radiator]{filter_whitelist}}.
#' Defautl: \code{whitelist.markers = NULL}.
#' \item \code{missing.memory} (option, path)
#' This argument allows to erase genotypes that have bad statistics.
#' It's the path to a file \code{.rad} file that contains 3 columns:
#' \code{MARKERS, INDIVIDUALS, ERASE}. The file is produced by several radiator
#' functions. For DArT data, \code{\link[radiator]{filter_rad}} generate the file.
#' Defautl: \code{missing.memory = NULL}. Currently not used.
#' \item \code{path.folder}: (optional, path) To write output in a specific folder.
#' Default: \code{path.folder = NULL}. The working directory is used.
#' \item \code{pop.levels}: detailed in \code{\link[radiator]{tidy_genomic_data}}.
#' }


#' @export
#' @rdname read_dart

#' @examples
#' \dontrun{
#' clownfish.dart.tidy <- radiator::read_dart(
#'     data = "clownfish.dart.csv",
#'     strata = "clownfish.strata.tsv"
#'     )
#' }
#' @seealso \code{\link{extract_dart_target_id}}

#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}

read_dart <- function(
    data,
    strata,
    filename = NULL,
    tidy.dart = FALSE,
    calibrate.alleles = TRUE,
    verbose = TRUE,
    parallel.core = parallel::detectCores() - 1,
    ...
) {

  # # for testing
  # data = "dart_test_2rows.csv"
  # # strata = "strata_dart_test_2rows.tsv"
  # strata = "strata_dart_test_2rows_bl_id.tsv"
  # filename = NULL
  # verbose = TRUE
  # parallel.core = parallel::detectCores() - 1
  # whitelist.markers = NULL
  # missing.memory <- NULL
  # path.folder = NULL
  # internal <- FALSE
  # tidy.dart = FALSE
  # calibrate.alleles = TRUE
  # gt = NULL
  # gt.bin = NULL
  # gt.vcf = NULL
  # gt.vcf.nuc = NULL
  # pop.levels = NULL

  # Cleanup-------------------------------------------------------------------
  radiator_function_header(f.name = "read_dart", verbose = verbose)
  file.date <- format(Sys.time(), "%Y%m%d@%H%M")
  # if (verbose) cat(paste0(stringi::stri_pad_both(str = paste0(" Reproducibility "), width = 80L, pad = "-"), "\n"))
  if (verbose) message(stringi::stri_pad_both(str = " Reproducibility ", width = 80L, pad = "-"), "\n")
  if (verbose) message("Execution date@time: ", file.date)
  old.dir <- getwd()
  opt.change <- getOption("width")
  options(width = 70)
  timing <- radiator_tic()
  #back to the original directory and options
  on.exit(setwd(old.dir), add = TRUE)
  on.exit(options(width = opt.change), add = TRUE)
  on.exit(radiator_toc(timing), add = TRUE)
  # on.exit(rm(list = setdiff(ls(envir = sys.frame(-1L)), obj.keeper), envir = sys.frame(-1L)))
  on.exit(radiator_function_header(f.name = "read_dart", start = FALSE, verbose = verbose), add = TRUE)

  # Function call and dotslist -------------------------------------------------
  rad.dots <- radiator_dots(
    func.name = as.list(sys.call())[[1]],
    fd = rlang::fn_fmls_names(),
    args.list = as.list(environment()),
    dotslist = rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE),
    keepers = c("whitelist.markers", "missing.memory",
                "path.folder", "internal", "pop.levels",
                "gt", "gt.bin", "gt.vcf", "gt.vcf.nuc",
                "tidy.check"
    ),
    verbose = FALSE
  )
  if (is.null(tidy.check)) tidy.check <- FALSE
  if (tidy.dart) {
    tidy.check <- TRUE
  } else {
    gt <- gt.vcf <- gt.vcf.nuc <- FALSE
  }

  # Checking for missing and/or default arguments ------------------------------
  if (missing(data)) rlang::abort("data is missing")


  # Generate folders and filenames ---------------------------------------------
  wf <- path.folder <- generate_folder(
    rad.folder = "read_dart",
    path.folder = path.folder,
    prefix.int = FALSE,
    internal = internal,
    file.date = file.date,
    verbose = verbose
  )

  # write the dots file
  write_radiator_tsv(
    data = rad.dots,
    path.folder = path.folder,
    filename = "radiator_tidy_dart_args",
    date = TRUE,
    internal = internal,
    write.message = "Function call and arguments stored in: ",
    verbose = verbose
  )


  filename.gds <- generate_filename(
    name.shortcut = filename,
    path.folder = path.folder,
    date = TRUE,
    extension = "gds.rad")$filename

  filename <- generate_filename(
    name.shortcut = filename,
    path.folder = path.folder,
    date = TRUE,
    extension = "arrow.parquet")$filename

  meta.filename <- generate_filename(
    name.shortcut = "radiator_tidy_dart_metadata",
    path.folder = path.folder,
    date = TRUE,
    extension = "tsv")$filename

  # Read TARGET_ID -------------------------------------------------------------
  # if (verbose) cat(paste0(stringi::stri_pad_both(str = paste0(" Target ids "), width = 80L, pad = "-"), "\n"))
  if (verbose) message(stringi::stri_pad_both(str = " Target ids ", width = 80L, pad = "-"), "\n")
  if (verbose) message("Extracting DArT TARGET_ID")
  target.id <- extract_dart_target_id(data, write = FALSE, metadata = TRUE)

  # Read Strata file -----------------------------------------------------------
  # if (verbose) cat(paste0(stringi::stri_pad_both(str = paste0(" Strata "), width = 80L, pad = "-"), "\n"))
  if (verbose) message(stringi::stri_pad_both(str = " Strata ", width = 80L, pad = "-"), "\n")
  strata <- radiator::read_strata(
    strata = strata,
    pop.levels = pop.levels,
    pop.labels = NULL,
    pop.select = NULL,
    blacklist.id = NULL,
    keep.two = FALSE,
    verbose = verbose
  )
  pop.levels <- strata$pop.levels
  strata <- strata$strata

  # Checks  --------------------------------------------------------------------
  strata <- checks_dart_target(
    target.id = target.id,
    strata = strata,
    path.folder = path.folder,
    verbose = TRUE
  )
  target.id <- NULL

  # Read markers metadata ------------------------------------------------------
  # if (verbose) cat(paste0(stringi::stri_pad_both(str = paste0(" Markers "), width = 80L, pad = "-"), "\n"))
  if (verbose) message(stringi::stri_pad_both(str = " Markers ", width = 80L, pad = "-"), "\n")
  markers.meta <- read_dart_markers_metadata(
    data = data,
    whitelist.markers = whitelist.markers,
    parallel.core = parallel.core,
    verbose = TRUE
  )
  variant.id <- markers.meta$variant.id
  markers.meta <- markers.meta$metadata

  # will save when recalibration is checked...below

  # Read Genotypes -------------------------------------------------------------
  # if (verbose) cat(paste0(stringi::stri_pad_both(str = paste0(" Genotypes "), width = 80L, pad = "-"), "\n"))
  if (verbose) message(stringi::stri_pad_both(str = " Genotypes ", width = 80L, pad = "-"), "\n")
  if (verbose) message("Importing DArT genotypes information from file...")

  # Check DArT format file
  dart.check <- detect_dart_format(data, verbose = FALSE)

  # split dataset by strata or cores
  strata.split <- split_vec(
    x = strata,
    chunks = parallel.core,
    tibble.split = FALSE,
    strata.split = TRUE
  )

  # Import
  cli::cli_progress_step(msg = "Big file...", msg_done = "done")
  genotypes <- purrr::pmap(
    .l = list(strata.split = strata.split),
    .f = import_dart_geno,
    variant.id = variant.id,
    data = data,
    dart.check = dart.check,
    parallel.core = parallel.core,
    .progress = TRUE
  ) %>%
    purrr::list_rbind(.) %>%
    dplyr::arrange(VARIANT_ID, ID_SEQ)
  cli::cli_progress_done()

  strata.split <- variant.id <- NULL

  # Calibration-----------------------------------------------------------------
  # if (verbose) cat(paste0(stringi::stri_pad_both(str = paste0(" Calibration "), width = 80L, pad = "-"), "\n"))
  if (verbose) message(stringi::stri_pad_both(str = " Calibration ", width = 80L, pad = "-"), "\n")

  if (calibrate.alleles) {
    switch <- detect_calibration_problem(x = genotypes, dart.check = dart.check)
    n.switch <- switch$n.switch
    switch <- switch$switch

    if (n.switch > 0) {
      markers.meta <- dart_calibration(switch = switch, metadata = markers.meta)
      genotypes <- dart_calibration(switch = switch, genotypes = genotypes)
    }#End if switching alleles
  } else {
    message("Calibration of alleles was turned off: not recommended")
  }


  # save markers metadata
  readr::write_tsv(
    x = markers.meta,
    file = meta.filename,
  )
  if (verbose) message("File written: ", basename(meta.filename))


  # Whitelist ------------------------------------------------------------------
  if (!is.null(whitelist.markers)) {
    genotypes %<>% dplyr::filter(VARIANT_ID %in% markers.meta$VARIANT_ID)
  }


  # Silico DArT ----------------------------------------------------------------
  if ("silico.dart" %in% dart.check$dart.format) {
    # Whitelist
    if (!is.null(whitelist.markers)) {
      data %<>% filter_whitelist(data = ., whitelist.markers = whitelist.markers)
    }

    want <- c("CLONE_ID", "SEQUENCE", strata$INDIVIDUALS)
    data %<>%
      dplyr::select(tidyselect::any_of(want)) %>%
      radiator::rad_long(
        x = .,
        cols = c("CLONE_ID", "SEQUENCE"),
        names_to = "INDIVIDUALS",
        values_to = "VALUE",
        variable_factor = FALSE
      )

    n.clone <- length(unique(data$CLONE_ID))
    data <- radiator::join_strata(data = data, strata = strata)

    filename <- generate_filename(
      name.shortcut = "radiator.silico.dart",
      path.folder = path.folder,
      date = TRUE,
      extension = "arrow.parquet"
    )$filename

    write_rad(
      data = data,
      filename = filename,
      write.message = "standard",
      verbose = verbose
    )


    if (verbose) message("################################### SUMMARY ####################################\n")
    message("\nNumber of clones: ", n.clone)
    summary_strata(strata)
    return(data)
  }

  # DArT genotyping ------------------------------------------------------------
  # if (verbose) cat(paste0(stringi::stri_pad_both(str = paste0(" Normalization "), width = 80L, pad = "-"), "\n"))
  if (verbose) message(stringi::stri_pad_both(str = " Normalization ", width = 80L, pad = "-"), "\n")

  # DArT genotyping strategy
  # All this can be overwritten in ... argument
  # gt.bin is the dosage of ALT allele: 0, 1, 2 NA
  # gt.bin <- TRUE #by default...
  n.markers <- nrow(markers.meta)

  dart.strategy <- dart_genotyping_strategy(
    n.markers = n.markers,
    gt.vcf = gt.vcf,
    gt.vcf.nuc = gt.vcf.nuc,
    gt = gt
  )

  # DArT genotyping
  genotypes <- dart_genotyper(
    x = genotypes,
    dart.check = dart.check,
    markers.metadata = markers.meta,
    gt.vcf = dart.strategy$gt.vcf,
    gt.vcf.nuc = dart.strategy$gt.vcf.nuc,
    gt = dart.strategy$gt
  )
  dart.strategy <- NULL

  # dart2gds -------------------------------------------------------------------
  # if (verbose) cat(paste0(stringi::stri_pad_both(str = paste0(" Format: GDS "), width = 80L, pad = "-"), "\n"))
  if (verbose) message(stringi::stri_pad_both(str = " Format: GDS ", width = 80L, pad = "-"), "\n")

  # Generate GDS
  dart.gds <- radiator_gds(
    genotypes = gt2array(
      genotypes = genotypes, n.ind = nrow(strata), n.snp = n.markers
    ),
    biallelic = TRUE,
    data.source = dart.check$data.type,
    strata = strata,
    markers.meta = markers.meta,
    genotypes.meta = genotypes,
    filename = filename.gds,
    open = TRUE,
    verbose = verbose
  )

  if (verbose) message("done!")

  # no longer relevant to have this function...
  # dart.gds <- dart2gds(
  #   genotypes = genotypes,
  #   strata = strata,
  #   markers.meta = markers.meta,
  #   filename.gds = filename.gds,
  #   dart.check = dart.check,
  #   parallel.core = parallel.core,
  #   verbose = verbose
  # )

  n.chrom <- length(unique(markers.meta$CHROM))
  n.locus <- length(unique(markers.meta$LOCUS))
  n.snp <- n.markers

  # Tidy data -----------------------------------------------------------------
  if (tidy.dart) {
    # if (verbose) cat(paste0(stringi::stri_pad_both(str = paste0(" Format: tidy "), width = 80L, pad = "-"), "\n"))
    if (verbose) message(stringi::stri_pad_both(str = " Format: tidy ", width = 80L, pad = "-"), "\n")
    if (verbose) message("Generating tidy data...")
    want <- c("VARIANT_ID", "M_SEQ", "VARIANT_ID","MARKERS", "CHROM", "LOCUS", "POS", "REF", "ALT")
    notwanted <- c("REF_COUNT", "ALT_COUNT", "M_SEQ", "REF", "ALT")
    genotypes %<>% dplyr::select(!dplyr::any_of(notwanted))

    dart.tidy <- markers.meta %>%
      dplyr::select(tidyselect::any_of(want)) %>%
      dplyr::left_join(genotypes, by = "VARIANT_ID") %>%
      dplyr::arrange(VARIANT_ID, ID_SEQ)
    want <- notwanted <- NULL

    # Final strata ---------------------------------------------------------
    if (!is.null(strata)) {
      strata.filename <- generate_filename(
        name.shortcut = "radiator_tidy_dart_strata",
        path.folder = path.folder,
        date = TRUE,
        extension = "tsv"
      )$filename

      # strata <- extract_individuals_metadata(gds = data, whitelist = TRUE)
      readr::write_tsv(x = strata, file = strata.filename)

      # if (rlang::has_name(dart.tidy, "TARGET_ID")) {
      common.col <- intersect(colnames(dart.tidy), colnames(strata))
      dart.tidy %<>%
        dplyr::left_join(
          dplyr::select(strata, ID_SEQ, INDIVIDUALS, STRATA)
          , by = common.col
        )
    }


    # Erase genotypes
    # ERASE <- NULL
    # if (!is.null(missing.memory)) {
    #   message("Using missing.memory file to erase genotypes")
    #   missing <- radiator::read_rad(data = missing.memory) %>%
    #     dplyr::arrange(MARKERS, INDIVIDUALS)
    #
    #   data %<>% dplyr::arrange(MARKERS, INDIVIDUALS)
    #
    #   #check identical markers
    #   same.markers <- identical(unique(missing$MARKERS), unique(data$MARKERS))
    #   same.individuals <- identical(unique(missing$INDIVIDUALS), unique(data$INDIVIDUALS))
    #   if (!same.markers || !same.individuals) {
    #     message("note: data and missing memory don't share all the same markers and/or individuals")
    #     data %<>% dplyr::left_join(missing, by = c("MARKERS", "INDIVIDUALS"))
    #     #%>% dplyr::mutate(ERASE = replace(ERASE, which(is.na(ERASE)), FALSE))
    #     data$ERASE[is.na(data$ERASE)] <- FALSE # faster
    #     which.missing <- which(data$ERASE)
    #     data %<>% dplyr::select(-ERASE)
    #   } else {
    #     which.missing <- which(missing$ERASE)
    #   }
    #
    #   message("Erasing genotypes and genotypes metadata...")
    #   if (rlang::has_name(data, "GT_BIN")) data$GT_BIN[which.missing] <- NA
    #   if (rlang::has_name(data, "GT")) data$GT[which.missing] <- "000000"
    #   if (rlang::has_name(data, "GT_VCF")) data$GT_VCF[which.missing] <- "./."
    #   if (rlang::has_name(data, "GT_VCF_NUC")) data$GT_VCF_NUC[which.missing] <- "./."
    #   if (rlang::has_name(data, "READ_DEPTH")) data$READ_DEPTH[which.missing] <- NA
    #   if (rlang::has_name(data, "ALLELE_REF_DEPTH")) data$ALLELE_REF_DEPTH[which.missing] <- NA
    #   if (rlang::has_name(data, "ALLELE_ALT_DEPTH")) data$ALLELE_ALT_DEPTH[which.missing] <- NA
    # }#End missing.memory


    # Write tidy --------
    write_rad(
      data = dart.tidy,
      filename = filename,
      verbose = verbose
    )

  }#tidy

  # summary dart ---------------------------------------------------------------
  # if (verbose) cat(paste0(stringi::stri_pad_both(str = paste0(" Summary "), width = 80L, pad = "-"), "\n"))
  if (verbose) message(stringi::stri_pad_both(str = " Summary ", width = 80L, pad = "-"), "\n")
  message("Number of chrom: ", n.chrom)
  message("Number of locus: ", n.locus)
  message("Number of SNPs: ", n.snp)
  summary_strata(strata)
  if (tidy.dart) {
    return(dart.tidy)
  } else {
    return(dart.gds)
  }
}#End read_dart


# extract_dart_target_id---------------------------------------------------------
#' @name extract_dart_target_id

#' @title Extract \href{http://www.diversityarrays.com}{DArT} target id

#' @description Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and might be of interest for some users. The function allows to extract DArT
#' target id from a DArT file. To help prepare the appropriate STRATA file.
#' You can also decide if you want the samples metadata.

#' @inheritParams read_dart

#' @param write (logical) With default \code{write = TRUE}, the dart target id column is
#' written in a file in the working directory.


#' @param metadata (logical) With default \code{metadata = FALSE}, the dart
#' target id and the sample metadata are extracted from the dart file and converted
#' into a tidy data frame.
#' \itemize{
#' \item \code{DART_NUMBER}: The DArT order or service number
#' \item \code{DART_PLATE_BARCODE}: The DArT plate barcode
#' \item \code{CLIENT_BARCODE}: The client plate barcode if provided
#' \item \code{WELL_ROW}: The well row position (A, B, C, D, E, F, G, H)
#' \item \code{WELL_COL}: The well column position (1 to 12)
#' \item \code{SAMPLE_COMMENTS}: The client sample comment if provided
#' \item \code{TARGET_ID}: Depending on the DArT file type, the target id
#' generated by DArT or sample info provided by the client.
#' \item \code{IMPORTANT NOTE:} DArT is not consistent with the output = always
#' verify the columns.
#' }

#' @return A tidy dataframe with a \code{TARGET_ID} column and metadata if requested.
#' For cleaning, the \code{TARGET_ID}
#' column is treated like the column \code{INDIVIDUALS}. Spaces and \code{,}
#' are removed, \code{_} and \code{:} are changed to a dash \code{-} and
#' UPPER case is used.
#' \href{https://thierrygosselin.github.io/radiator/reference/clean_ind_names.html}{see cleaning doc for logic behind this}.

#' @export
#' @rdname extract_dart_target_id

#' @examples
#' \dontrun{
#' # Built a strata file:
#' strata <- radiator::extract_dart_target_id("mt.dart.file.csv") %>%
#'     dplyr::mutate(
#'         INDIVIDUALS = "new id you want to give",
#'         STRATA = "fill this"
#'     ) %>%
#'     readr::write_tsv(x = ., file = "my.new.dart.strata.tsv")
#' }

#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}

extract_dart_target_id <- function(data, write = TRUE, metadata = FALSE) {
  ##TEST
  # write = FALSE
  # write = TRUE
  # metadata = FALSE
  # metadata = TRUE

  # Checking for missing and/or default arguments ------------------------------
  if (missing(data)) rlang::abort("Input file missing")

  # Check DArT -----------------------------------------------------------------
  # format file
  dart.check <- detect_dart_format(data)

  # Import data ----------------------------------------------------------------
  no.metadata <- dart.check$skip.number == 1
  if (no.metadata && metadata) metadata <- FALSE
  if (metadata) {
    matadata.keeper <- (dart.check$skip.number + 1L)
    if (matadata.keeper > 7) metadata <- FALSE # don't know what DArT did
    if (matadata.keeper < 6) metadata <- FALSE # don't know what DArT did
  }
  if (metadata) {
    if (matadata.keeper == 6) metadata.colnames <- c("DART_NUMBER", "DART_PLATE_BARCODE", "CLIENT_BARCODE", "WELL_ROW", "WELL_COL", "TARGET_ID")
    if (matadata.keeper == 7) metadata.colnames <- c("DART_NUMBER", "DART_PLATE_BARCODE", "CLIENT_BARCODE", "WELL_ROW", "WELL_COL", "SAMPLE_COMMENTS", "TARGET_ID")

    dart.target.id <- readr::read_delim(
      file = data,
      delim = dart.check$tokenizer.dart,
      skip = 0,
      n_max = matadata.keeper,
      na = "-",
      col_names = FALSE,
      col_types = readr::cols(.default = readr::col_character())
    ) %>%
      t %>% # Transpose the data
      magrittr::set_colnames(metadata.colnames) %>% # Set column names from the vector
      tibble::as_tibble(.) # Convert to tibble

    if (dart.check$star.number > 0) {
      dart.target.id  %<>% dplyr::filter(dplyr::row_number() > dart.check$star.number)
    }

    well.row.check <- all(stringi::stri_detect_regex(str = unique(dart.target.id$WELL_ROW), pattern = "[[A-Z]]"))
    well.col.check <- all(stringi::stri_detect_regex(str = as.integer(unique(dart.target.id$WELL_COL)), pattern = "[[0-9]]"))

    if (all(!well.row.check, !well.col.check)) {
      message("Verify: potential problem in columns names and ordering... ")
    } else {
      if (!well.row.check) message("To do: verify WELL_ROW column")
      if (!well.col.check) message("To do: verify WELL_COL column")
    }

  } else {
    dart.target.id <- readr::read_delim(
      file = data,
      delim = dart.check$tokenizer.dart,
      skip = dart.check$skip.number,
      n_max = 1,
      na = "-",
      col_names = FALSE,
      col_types = readr::cols(.default = readr::col_character())) %>%
      t %>%
      magrittr::set_colnames("TARGET_ID") %>%
      tibble::as_tibble(.)

    if (dart.check$star.number > 0) {
      dart.target.id %<>%
        dplyr::filter(dplyr::row_number() > dart.check$star.number)
    } else {
      # This is a string of known DArT col header not wanted
      discard <- c(
        "TARGET_ID", "ALLELE_ID", "CLONE_ID", "CLUSTER_TEMP_INDEX", "ALLELE_SEQUENCE",
        "CLUSTER_CONSENSUS_SEQUENCE", "CLUSTER_SIZE", "ALLELE_SEQ_DIST", "SNP",
        "SNP_POSITION", "CALL_RATE", "ONE_RATIO_REF", "ONE_RATIO_SNP", "FREQ_HOM_REF",
        "FREQ_HOM_SNP", "FREQ_HETS", "PIC_REF", "PIC_SNP", "AVG_PIC", "AVG_COUNT_REF",
        "AVG_COUNT_SNP", "RATIO_AVG_COUNT_REF_AVG_COUNT_SNP",
        "FREQ_HETS_MINUS_FREQ_MIN_HOM",
        "ALLELE_COUNTS_CORRELATION", "AGGREGATE_TAGS_TOTAL", "DERIVED_CORR_MINUS_SEED_CORR",
        "REP_REF", "REP_SNP", "REP_AVG", "PIC_REP_REF", "PIC_REP_SNP", "TOTAL_PIC_REP_REF_TEST",
        "TOTAL_PIC_REP_SNP_TEST", "BIN_ID", "BIN_SIZE", "ALLELE_SEQUENCE_REF",
        "ALLELE_SEQUENCE_SNP", "TRIMMED_SEQUENCE_REF", "TRIMMED_SEQUENCE", "ONE_RATIO",
        "PIC", "AVG_READ_DEPTH", "STDEV_READ_DEPTH", "Q_PMR", "REPRODUCIBILITY", "MAF",
        "TOT_COUNTS", "TOTAL_PIC_REP_TEST", "PIC_REP")

      discard.genome <- c("CHROM_|CHROM_POS_|ALN_CNT_|ALN_EVALUE_")

      dart.target.id %<>%
        dplyr::filter(!radiator_snakecase(x = TARGET_ID) %in% discard) %>%
        dplyr::filter(!stringi::stri_detect_regex(
          str = stringi::stri_trans_toupper(TARGET_ID),
          pattern = discard.genome, negate = FALSE)) %>%
        dplyr::filter(!stringi::stri_detect_regex(
          str = radiator_snakecase(TARGET_ID),
          pattern = discard.genome, negate = FALSE))
    }
  }

  dart.target.id %<>%
    dplyr::mutate(
      TARGET_ID = clean_ind_names(x = TARGET_ID),
      TARGET_ID = stringi::stri_trans_toupper(TARGET_ID)
    )
  if (write) readr::write_tsv(x = dart.target.id, file = "dart.target.id.tsv")

  # Check DArT as good target id written ---------------------------------------
  n.target.id <- nrow(dart.target.id)

  if (n.target.id != length(unique(dart.target.id$TARGET_ID))) {
    rlang::warn(
      "Non unique TARGET_ID or sample names used in the DArT file.
Solution: edit manually")
  }

  message("Number of individuals in DArT file: ", n.target.id)
  return(dart.target.id)
}#End extract_dart_target_id



# checks_dart_target------------------------------------------------------------
#' @title checks_dart_target
#' @description Checks target.id and strata for DArT potential problems.
#' @rdname checks_dart_target
#' @keywords internal
#' @export

checks_dart_target <- function(
    target.id,
    strata,
    path.folder = NULL,
    verbose = TRUE
) {

  # checks....
  # target_id

  n.ind.dart <- nrow(target.id)

  # Check for duplicate names in DArT target ids... yes happening all the time
  duplicate.id.data <- length(target.id$TARGET_ID) - length(unique(target.id$TARGET_ID))

  if (duplicate.id.data > 0) {
    message("Number of duplicated TARGET_ID in DArT file: ", duplicate.id.data)
    rlang::abort("Did you modify the DArT file ?\nYES: Revert back to the original DArT file because you made a mistake.\nNO: Don't have much choice: modify the DArT file to have unique TARGET_ID columns")
  }

  # strata
  n.ind.strata <- nrow(strata)

  # Check for duplicate names in the strata... yes happening all the time
  duplicate.id.strata <- length(strata$INDIVIDUALS) - length(unique(strata$INDIVIDUALS))

  if (duplicate.id.strata > 0) {
    message("Duplicated individuals names found in the strata.\n   number of duplicate names = ", duplicate.id.strata, "\n")
    rlang::abort("\nFix the strata with unique names and\nverify the DArT file for the same issue, adjust accordingly...")
  }

  # for speed
  strata %<>%
    dplyr::mutate(
      ID_SEQ = seq_len(length.out = dplyr::n()),
      STRATA_SEQ = as.integer(factor(x = STRATA, levels = levels(STRATA)))
    )

  # check TARGET_ID in strata match TARGET_ID in the DArT file
  if (n.ind.dart != n.ind.strata) {
    if (verbose) message("\nDifferent number of samples between strata and DArT file")
    if (verbose) message("Using individuals in strata file to filter individuals in DArT file")
  }
  # strata.id.check <- strata %>%
  # dplyr::mutate(IN_DART = strata$TARGET_ID %in% target.id$TARGET_ID)
  # strata.id.pass <- !FALSE %in% (unique(strata.id.check$IN_DART))

  # changed the way it look for blacklisted samples...


  # blacklist from strata
  bl.strata <- dplyr::filter(strata, !TARGET_ID %in% target.id$TARGET_ID) %>%
    dplyr::mutate(COMMENT = "missing from strata")
  n.bl.strata <- nrow(bl.strata)
  if (n.bl.strata > 0) {
    message("\nBlacklisted samples from the strata: ", n.bl.strata)
  } else {
    bl.strata <- NULL
    if (verbose) message("\nBlacklisted samples from the strata: 0")
  }

  # blacklist from DArT file (samples removed from DArT file)
  bl.data <- dplyr::filter(target.id, !TARGET_ID %in% strata$TARGET_ID) %>%
    dplyr::mutate(COMMENT = "missing from DArT file")
  n.bl.data <- nrow(bl.data)

  if (n.bl.data > 0) {
    message("Blacklisted samples from DArT file: ", n.bl.data)
  } else {
    bl.data <- NULL
    if (verbose) message("Blacklisted samples from DArT file: 0")
  }

  # Combined blacklists
  n.id.bl <- n.bl.strata + n.bl.data

  if (n.id.bl > 0) {
    message("Total number of blacklisted samples: ", n.id.bl)
    # writing the blacklist of id
    blacklist.id <- dplyr::bind_rows(bl.strata, bl.data) %>%
      dplyr::distinct(TARGET_ID, .keep_all = TRUE) %>%
      dplyr::select_if(~ !any(is.na(.)))

    blacklist.individuals.file <- radiator::generate_filename(
      name.shortcut = "blacklist.individuals",
      path.folder = path.folder,
      extension = "tsv",
      date = TRUE
    )$filename

    readr::write_tsv(
      x = blacklist.id,
      file = blacklist.individuals.file
    )
    if (verbose) message("File written: ",  basename(blacklist.individuals.file))


    blacklist.id <- blacklist.id$TARGET_ID

    # update the strata
    strata %<>% dplyr::filter(!TARGET_ID %in% blacklist.id)

    if (verbose) message("\nNote: Careful if using DArT internal statistics generated for all samples...")
    strata.id.check <- NULL
  } else {
    if (verbose) message("Total number of blacklisted samples: ", n.id.bl)
    blacklist.id <- NULL
  }

  if (n.id.bl == n.ind.dart) {
    rlang::abort("\nAll samples blacklisted ... verify the strata and DArT target ids")
  }
  # Not the same
  # !identical(sort(target.id$TARGET_ID), sort(strata$TARGET_ID)))
  n.ind.dart <- n.bl.strata <- n.id.bl <- n.bl.data <- bl.strata <- bl.data <- target.id <- NULL

  return(strata)
}#End checks_dart_target



# read_dart_markers_metadata ---------------------------------------------------
#' @title read_dart_markers_metadata
#' @description Read DArT markers/locus metadata.
#' @rdname read_dart_markers_metadata
#' @keywords internal
#' @export

read_dart_markers_metadata <- function(
    data,
    whitelist.markers = NULL,
    parallel.core = parallel::detectCores() - 1,
    verbose = TRUE
) {
  dart.check <- detect_dart_format(data, verbose = FALSE)

  message("Importing DArT markers metadata")

  if (dart.check$star.number > 0) {
    metadata.ncol <- dart.check$star.number
  } else {
    rlang::abort("Contact author: problem with star number in DArT file")
  }

  metadata <- suppressWarnings(
    vroom::vroom(
      file = data,
      delim = dart.check$tokenizer.dart,
      col_select = 1:metadata.ncol,
      col_names = TRUE,
      skip = dart.check$skip.number,
      na = c("-", "NA",""),
      guess_max = 20,
      altrep = TRUE,
      skip_empty_rows = TRUE,
      trim_ws = FALSE,
      num_threads = parallel.core,
      progress = FALSE,
      show_col_types = FALSE
    )
  )
  if (dart.check$dart.format == "2rows" || dart.check$dart.format == "counts") {
    n.markers <- nrow(metadata) / 2
    variant.id <- rep(1:n.markers, times = 1, each = 2)
  } else {
    n.markers <- nrow(metadata)
    variant.id <- rep(1:n.markers, times = 1)
  }
  metadata %<>%  tibble::add_column(VARIANT_ID = variant.id, .before = 1)
  # keep a non filter vector for import genotypes FAKE# keep a non filter vector for import genotypes later...


  # clean the headers --------------------------------
  if (verbose) message("DArT markers naming normalization")
  metadata <- clean_dart_colnames(
    x = metadata,
    dart.check = dart.check,
    blacklist.id = NULL,
    strata = NULL
  )

  # Check for problematic DArT 2 rows and counts
  # When modified, this is a common problem found ...
  unique.variant.id <- unique(metadata$VARIANT_ID)
  n.markers <- length(unique.variant.id)
  if (dart.check$dart.format == "2rows" || dart.check$dart.format == "counts") {
    if (n.markers != nrow(metadata) / 2) {
      message("\n\nProblem with DArT file")
      path.folder <- getwd()
      prob.file <- file.path(path.folder, "dart_problem.tsv")

      check <- metadata %>%
        dplyr::count(MARKERS, LOCUS) %>%
        dplyr::filter(n > 2) %>%
        dplyr::select(LOCUS) %>%
        unlist()

      dplyr::filter(metadata, LOCUS %in% check) %>%
        readr::write_tsv(x = ., file = prob.file)
      check <- prob.file <- NULL
      rlang::abort("The number of AlleleID/Locus is wrong.\nCheck the modifications you made on the original DArT file. \n\nFile written: dart_problem.tsv")
    }
  }

  # check if some rows are missing...
  if (anyNA(unique(metadata$REF))) {
    if (length(unique(metadata$MARKERS[!is.na(metadata$REF)])) != length(unique(metadata$MARKERS[is.na(metadata$REF)]))) {
      rlang::abort("The DArT file is missing rows...")
    }
  }

  # warning if the markers once arranged are not the same as the DArT file...
  if (!identical(unique.variant.id, order(unique.variant.id))) {
    rlang::abort("Mixed up markers order, contact author")
  }

  # Whitelist for the markers meta...
  if (!is.null(whitelist.markers)) {
    # read whitelist
    whitelist.markers <- radiator::read_whitelist(
      whitelist.markers = whitelist.markers,
      verbose = TRUE)

    common.markers <- (intersect(colnames(metadata), colnames(whitelist.markers)))
    want <- c("VARIANT_ID", "M_SEQ", "MARKERS", "CHROM", "LOCUS", "POS")

    want.markers <- purrr::keep(
      .x = common.markers,
      .p = common.markers %in% want
    )

    whitelist.markers %<>%
      dplyr::select(tidyselect::any_of(want.markers)) %>%
      dplyr::mutate(FILTERS = "whitelist")

    metadata %<>%
      dplyr::full_join(whitelist.markers, by = want.markers) %>%
      dplyr::filter(FILTERS == "whitelist")
  }

  metadata %<>%
    dplyr::mutate(FILTERS = "whitelist") %>% # again, for datasets without whitelist
    dplyr::filter(!is.na(REF) | !is.na(ALT)) %>%
    dplyr::distinct(VARIANT_ID, .keep_all = TRUE) %>%
    dplyr::select(
      FILTERS, M_SEQ, VARIANT_ID, MARKERS, CHROM, LOCUS, POS, COL, REF, ALT,
      tidyselect::everything(.)
    ) %>%
    dplyr::arrange(VARIANT_ID)
  return(list(metadata = metadata, variant.id = variant.id))
}#End read_dart_markers_metadata


# clean_dart_colnames----------------------------------------------------------------
#' @title clean_dart_colnames
#' @description clean_dart_colnames (only the DArT columns = snakecase...)
#' @rdname clean_dart_colnames
#' @keywords internal
#' @export
clean_dart_colnames <- function(x, dart.check, blacklist.id = NULL, strata = NULL) {

  metadata.ncol <- dart.check$star.number# the most reliable if available
  n.col.x <- length(colnames(x))

  if (metadata.ncol == 0) {
    if (is.null(blacklist.id)) {
      metadata.ncol <- n.col.x - length(strata$TARGET_ID)
    } else {
      metadata.ncol <- n.col.x - length(strata$TARGET_ID) - length(blacklist.id)
    }
  }

  # id.clean or metadata clean
  id.clean <- metadata.clean <- FALSE #default just the metadata

  if (is.null(strata)) {
    metadata.clean <- TRUE
  } else {
    id.clean <- TRUE
  }

  # clean the dart metadata cols
  # clean the dart target ids
  # here doing all colnames is ok, because keeper above is allready upper caps
  # added 20190528

  # We want snakecase not camelcase
  # Change the TARGET_ID by INDIVIDUALS...

  if (id.clean) {
    # colnames(x) <- c(
    #   radiator::radiator_snakecase(x = colnames(x)[1:metadata.ncol]),
    #   stringi::stri_trans_toupper(clean_ind_names(colnames(x)[-c(1:metadata.ncol)]))
    # )
    colnames(x) <- c(
      stringi::stri_trans_toupper(clean_ind_names(colnames(x)))
    )

    # remove blacklisted ids
    if (!is.null(blacklist.id)) {
      x %<>% dplyr::select(-tidyselect::any_of(blacklist.id))
    }

    # combined with strata
    colnames(x) <- tibble::tibble(TARGET_ID = colnames(x)) %>%
      dplyr::left_join(strata, by = "TARGET_ID") %>%
      dplyr::mutate(
        INDIVIDUALS = dplyr::if_else(
          is.na(INDIVIDUALS), TARGET_ID, INDIVIDUALS)
      ) %$% INDIVIDUALS

  } else {
    colnames(x) <- c(
      radiator::radiator_snakecase(x = colnames(x)))
  }


  # Variant Call Format SPECS----------------------------------------------------
  # keep consensus sequence if found
  # Changed 2020-02-06
  # TRIMMED_SEQUENCE
  # turns out I think it's better to keep ALLELE_SEQUENCE...

  # Define possible column names to check
  # possible.columns <- c("ALLELE_SEQUENCE", "TRIMMED_SEQUENCE", "CLUSTER_CONSENSUS_SEQUENCE")
  # Check for preferred column
  preferred.col <- dplyr::case_when(
    "ALLELE_SEQUENCE" %in% names(x) ~ "ALLELE_SEQUENCE",
    "TRIMMED_SEQUENCE" %in% names(x) ~ "TRIMMED_SEQUENCE",
    "CLUSTER_CONSENSUS_SEQUENCE" %in% names(x) ~ "CLUSTER_CONSENSUS_SEQUENCE",
    TRUE ~ NA_character_
  )



  # Rename columns dynamically
  x %<>% dplyr::rename_with(
    .fn = ~ "SEQUENCE",    # Rename to "SEQUENCE"
    .cols = tidyselect::any_of(preferred.col)  # Select columns from possible.columns
  )

  # For all  except SILICO ------------------------------------------
  silico.dart <- "silico.dart" %in% dart.check$dart.format

  if (!silico.dart) {
    # Changed in 2024 with missing AlleleID and 2 new column names: MarkerName and Variant
    # MarkerName is the same as AlleleID
    # SNP_POSITION is missing and remains embedded in MarkerName like it was in AlleleID

    # ALLELE_ID or CLONE_ID -> LOCUS
    # SNP_POSITION -> POS

    x %<>% dplyr::rename_with(
      .fn = ~ stringi::stri_replace_all_fixed(
        str = .,
        pattern = c("ALLELE_ID", "SNP_POSITION"),
        replacement = c("LOCUS", "POS"),
        vectorize_all = FALSE
      ),
      .cols = tidyselect::any_of(c("ALLELE_ID", "SNP_POSITION"))
    )

    # If CLONE_ID exists and LOCUS already exists → remove CLONE_ID.
    # If CLONE_ID exists but LOCUS doesn't → rename CLONE_ID to LOCUS.

    if ("CLONE_ID" %in% names(x)) {
      if ("LOCUS" %in% names(x)) {
        x %<>% dplyr::select(-CLONE_ID)
      } else {
        x %<>% dplyr::rename_with(
          .fn = ~ stringi::stri_replace_all_fixed(., "CLONE_ID", "LOCUS"),
          .cols = tidyselect::any_of("CLONE_ID")
        )
      }
    }



    # If Locus info not found beyond that point....

    # 1. If "MARKER_NAME" exists:
    ## Rename it to "LOCUS".
    # If "VARIANT" exists:
    ## Rename it to "VARIANT_DART".
    ## Create a new column SNP = VARIANT_DART.

    # 2. Else, abort.
    if (!"LOCUS" %in% names(x)) {
      if ("MARKER_NAME" %in% names(x)) {

        # Rename MARKER_NAME to LOCUS
        x %<>% dplyr::rename_with(
          .fn = ~ stringi::stri_replace_all_fixed(., "MARKER_NAME", "LOCUS"),
          .cols = tidyselect::any_of("MARKER_NAME")
        )

        if ("VARIANT" %in% names(x)) {
          # Rename VARIANT to VARIANT_DART
          x %<>% dplyr::rename_with(
            .fn = ~ stringi::stri_replace_all_fixed(., "VARIANT", "VARIANT_DART"),
            .cols = tidyselect::any_of("VARIANT")
          )

          # Copy VARIANT_DART to new column SNP
          x %<>% dplyr::mutate(SNP = .data$VARIANT_DART)
        }

      } else {
        rlang::abort("\nProblem tidying DArT dataset: contact author")
      }
    }

    # necessary steps...observed with DArT file using ref genome ---------------
    # x %<>% dplyr::filter(!is.na(LOCUS)) # not sure it's necessary after all

    # Check for duplicate rows (sometimes people combine DArT data...)
    if (rlang::has_name(x, "POS")) {
      # x %<>% dplyr::arrange(LOCUS, POS)
      data.dup <- nrow(dplyr::distinct(x, LOCUS, SNP, POS, CALL_RATE, .keep_all = FALSE))
    } else {
      # x %<>% dplyr::arrange(LOCUS)
      data.dup <- nrow(dplyr::distinct(x, LOCUS, SNP, CALL_RATE, .keep_all = FALSE))
    }

    # make sure no duplicates
    if (nrow(x) != data.dup) {
      rlang::abort("Duplicate rows were identified")
      # message("    using distinct rows")
      # message("    check data if downstream problems")
      # message("    better ways to combined DArT files ...")
      # data %<>% dplyr::distinct(LOCUS, SNP, POS, CALL_RATE, .keep_all = TRUE)
    }
    data.dup <- NULL

    # Screen for duplicate names -------------------------------------------------
    id <- purrr::discard(
      .x = colnames(x),
      .p = colnames(x) %in% c("LOCUS", "SNP", "POS", "CALL_RATE", "AVG_COUNT_REF",
                              "AVG_COUNT_SNP", "REP_AVG")
    )
    dup.id <- length(id) - length(unique(id))
    if (dup.id > 0) {
      rlang::abort(stringi::stri_join("Duplicated individual names in the data: ", dup.id))
    }
    id <- dup.id <- NULL # removing unused object

    # clean locus, generate MARKERS and VARIANT_ID
    x %<>% clean_dart_locus(.)
  }# End re formating for non silico dart data

  return(x)
}#End clean_dart_colnames

# clean_dart_locus--------------------------------------------------------------
#' @title clean_dart_locus
#' @description Clean LOCUS and generate VARIANT_ID and MARKERS
#' @rdname clean_dart_locus
#' @keywords internal
#' @export
clean_dart_locus <- function(x, fast = TRUE) {

  if (fast) {
    # x <- data
    if (!rlang::has_name(x, "CHROM")) x %<>% dplyr::mutate(CHROM = "CHROM_1")
    want <- c("VARIANT_ID", "M_SEQ", "MARKERS", "CHROM", "LOCUS", "POS", "COL",
              "REF", "ALT", "SEQUENCE",
              "CALL_RATE", "AVG_COUNT_REF", "AVG_COUNT_SNP", "REP_AVG")

    if (rlang::has_name(x, "VARIANT_DART")) {
      x %<>%
        dplyr::mutate(
          COL = stringi::stri_extract_first_regex(
            str = LOCUS,
            pattern = "[-][0-9]+[\\:]"),
          COL = stringi::stri_replace_all_fixed(
            str = COL,
            pattern = c("-", ":"),
            replacement = c("", ""),
            vectorize_all = FALSE),
          POS = COL,
          COL = as.integer(COL),
          LOCUS = stringi::stri_extract_first_regex(str = LOCUS, pattern = "^[0-9]+"),
          REF = stringi::stri_extract_first_regex(str = VARIANT_DART, pattern = "[A-Z]"),
          ALT = stringi::stri_extract_last_regex(str = VARIANT_DART, pattern = "[A-Z]"),
          MARKERS = stringi::stri_join(CHROM, LOCUS, POS, sep = "__"),
          # VARIANT_ID = as.integer(factor(MARKERS)),
          M_SEQ = VARIANT_ID,
          VARIANT_DART = NULL,
          SNP = NULL
        ) %>%
        dplyr::select(tidyselect::any_of(want), tidyselect::everything()) %>%
        dplyr::mutate(
          dplyr::across(
            .cols = c(MARKERS, CHROM, LOCUS, POS),
            .fns = as.character
          )
        ) %>%
        dplyr::arrange(VARIANT_ID, CHROM, LOCUS, POS, REF)
    } else {
      x %<>%
        dplyr::mutate(
          COL = stringi::stri_extract_first_regex(
            str = LOCUS,
            pattern = "[-][0-9]+[\\:]"),
          COL = stringi::stri_replace_all_fixed(
            str = COL,
            pattern = c("-", ":"),
            replacement = c("", ""),
            vectorize_all = FALSE),
          COL = as.integer(COL),
          LOCUS = stringi::stri_extract_first_regex(str = LOCUS, pattern = "^[0-9]+"),
          REF = stringi::stri_extract_first_regex(str = SNP, pattern = "[A-Z]"),
          ALT = stringi::stri_extract_last_regex(str = SNP, pattern = "[A-Z]"),
          MARKERS = stringi::stri_join(CHROM, LOCUS, POS, sep = "__"),
          # VARIANT_ID = as.integer(factor(MARKERS)),
          M_SEQ = VARIANT_ID,
          SNP = NULL
        ) %>%
        dplyr::select(tidyselect::any_of(want), tidyselect::everything()) %>%
        dplyr::mutate(
          dplyr::across(
            .cols = c(MARKERS, CHROM, LOCUS, POS),
            .fns = as.character
          )
        ) %>%
        dplyr::arrange(VARIANT_ID, CHROM, LOCUS, POS, REF)
    }
  } else {
    want <- c("VARIANT_ID", "M_SEQ", "MARKERS", "CHROM", "LOCUS", "POS", "COL",
              "REF", "ALT", "SEQUENCE",
              "CALL_RATE", "AVG_COUNT_REF", "AVG_COUNT_SNP", "REP_AVG")
    x %<>%
      tidyr::separate(col = LOCUS,
                      into = c("LOCUS", NA),
                      sep = "\\|",
                      extra = "drop"
      ) %>%
      tidyr::separate(col = SNP,
                      into = c(NA, "KEEPER"),
                      sep = ":",
                      extra = "drop") %>%
      tidyr::separate(col = KEEPER, into = c("REF", "ALT"), sep = ">") %>%
      dplyr::mutate(
        CHROM = rep("CHROM_1", n()),
        MARKERS = stringi::stri_join(CHROM, LOCUS, POS, sep = "__"),
        # VARIANT_ID = as.integer(factor(MARKERS)),
        M_SEQ = VARIANT_ID,
        COL = POS
      ) %>%
      dplyr::select(tidyselect::any_of(want), tidyselect::everything()) %>%
      dplyr::mutate(
        dplyr::across(
          .cols = c(MARKERS, CHROM, LOCUS, POS),
          .fns = as.character
        )
      ) %>%
      dplyr::arrange(VARIANT_ID, CHROM, LOCUS, POS, REF)
  }
}#End clean_dart_locus


# switch_allele_count----------------------------------------------------------------
#' @title switch_allele_count
#' @description Switch from dart genotyping 2 presence absence
#' @rdname switch_allele_count
#' @keywords internal
#' @export

switch_allele_count <- function(x, dart.check = FALSE, ref = TRUE) {
  # dart.group == TRUE (=1rows) vs presence/absence coding
  # dart.group == FALSE 2rows: presence/absence

  # 1row
  if (dart.check$dart.format == "1row") {
    # WE WANT THIS (dosage alternate allele):
    # 1 = HOM ALT
    # 2 = HET
    # 0 = HOM REF
    if (ref) {#REF count
      x <- dplyr::case_when(
        x == 1L ~ 0L,
        x == 2L ~ 1L,
        x == 0L ~ 1L
      )
    } else {#ALT count
      x <- dplyr::if_else(x == 2L, 1L, x)
      # the other coding remain the same
    }
  }

  # 2rows
  if (dart.check$dart.format == "2rows") {
    # 2rows: presence/absence
    # here we want count of alternate allele instead...
    # x <- as.integer(dplyr::recode(.x = as.character(x), "0" = "1", "1" = "0"))
    # case_when is much faster than recode...
    x <- dplyr::case_when(
      x == 0L ~ 1L,
      x == 1L ~ 0L
    )
  }
  return(x)
}# End switch_allele_count

# import_dart_geno----------------------------------------------------------------
#' @title import_dart_geno
#' @description Import DArt genotypes
#' @rdname import_dart_geno
#' @keywords internal
#' @export
import_dart_geno <- function(strata.split, variant.id, data, dart.check, parallel.core) {
  # test
  # strata.split <- strata.split[[1]]
  # Default
  new.col.names <- strata.split$ID_SEQ
  want.col <- strata.split$TARGET_ID

  if (dart.check$dart.format != "1row") {
    want.col <- c("SNP", strata.split$TARGET_ID)
    new.col.names <- c("SNP", strata.split$ID_SEQ)
  }
  gen <- suppressWarnings(
    vroom::vroom(
      file = data,
      delim = dart.check$tokenizer.dart,
      col_select = want.col,
      col_names = TRUE,
      skip = dart.check$skip.number,
      na = c("-", "NA",""),
      guess_max = 20,
      altrep = TRUE,
      skip_empty_rows = TRUE,
      trim_ws = FALSE,
      num_threads = parallel.core,
      progress = FALSE,
      show_col_types = FALSE
    )
  ) %>%
    magrittr::set_colnames(x = ., value = new.col.names) %>%
    tibble::add_column(
      VARIANT_ID = variant.id,
      .before = 1
    )
  new.col.names <- want.col <- NULL

  #1-row
  if ("1row" %in% dart.check$dart.format) {
    # 0: REF/REF, 1: ALT/ALT, 2: REF/ALT
    # We transform into PRESENCE / ABSENCE
    # REF_COUNT: 0 -> 1, 1 -> 0, 2 -> 1
    # ALT_COUNT: 0 -> 0, 1 -> 1, 1 -> 1


    gen %<>%
      radiator::rad_long(
        x = .,
        cols = "VARIANT_ID",
        names_to = "ID_SEQ",
        values_to = "GDS",
        tidy = TRUE
      ) %>%
      dplyr::mutate(
        REF_COUNT = dplyr::case_when(
          GDS == 1L ~ 0L,
          GDS == 2L ~ 1L,
          GDS == 0L ~ 1L
        ),
        ALT_COUNT = dplyr::if_else(GDS == 2L, 1L, GDS), # the other coding remain the same
        GDS = NULL
      )
  }#END 1row genotypes

  #2-rows or counts
  if (TRUE %in% (c("counts", "2rows") %in% dart.check$dart.format)) {
    # n.ind <- ncol(gen) - 2
    # n.snp <- nrow(gen) / 2

    ref.values <- "REF_COUNT"
    alt.values <- "ALT_COUNT"
    if (dart.check$dart.format == "counts") {
      ref.values <- "ALLELE_REF_DEPTH"
      alt.values <- "ALLELE_ALT_DEPTH"
    }

    gen <- suppressMessages(
      dplyr::filter(gen, is.na(SNP)) %>%
        dplyr::select(-SNP) %>%
        radiator::rad_long(
          x = .,
          cols = "VARIANT_ID",
          names_to = "ID_SEQ",
          values_to = ref.values,
          tidy = TRUE
        ) %>%
        dplyr::bind_cols(
          dplyr::filter(gen, !is.na(SNP)) %>%
            dplyr::select(-SNP) %>%
            radiator::rad_long(
              x = .,
              cols = "VARIANT_ID",
              names_to = "ID_SEQ",
              values_to = alt.values,
              tidy = TRUE
            )
        )
    )

    if (dart.check$dart.format == "counts") {
      gen %<>% dplyr::mutate(
        REF_COUNT = dplyr::if_else(ALLELE_REF_DEPTH > 0, 1, ALLELE_REF_DEPTH),
        ALT_COUNT = dplyr::if_else(ALLELE_ALT_DEPTH > 0, 1, ALLELE_ALT_DEPTH)
      )
    }

    # Faster to check that the bind_cols worked by checking the variant id
    if (!identical(gen$VARIANT_ID...1, gen$VARIANT_ID...4)) {
      rlang::abort("Contact author, DArT tiding problem")
    } else {
      gen %<>% dplyr::select(-VARIANT_ID...4) %>% dplyr::rename(VARIANT_ID = VARIANT_ID...1)
    }

    # Faster to check that the bind_cols worked by checking ID_SEQ
    if (!identical(gen$ID_SEQ...2, gen$ID_SEQ...5)) {
      rlang::abort("Contact author, DArT tiding problem")
    } else {
      gen %<>% dplyr::select(-ID_SEQ...5) %>% dplyr::rename(ID_SEQ = ID_SEQ...2)
    }
  } #2rows genotypes
  return(gen)
}#End import_dart_geno

# detect_calibration_problem----------------------------------------------------------------
#' @title detect_calibration_problem
#' @description Detect DArT with REF and ALT not calibrated... based on alleles or depth
#' @rdname detect_calibration_problem
#' @keywords internal
#' @export
detect_calibration_problem <- function(x, dart.check) {

  if ("counts" %in% dart.check$dart.format) {
    switch <- x %>%
      dplyr::summarise(
        ALLELE_REF_DEPTH = sum(ALLELE_REF_DEPTH, na.rm = TRUE),
        ALLELE_ALT_DEPTH = sum(ALLELE_ALT_DEPTH, na.rm = TRUE),
        .by = "VARIANT_ID"
      ) %>%
      dplyr::filter(ALLELE_REF_DEPTH < ALLELE_ALT_DEPTH) %>%
      dplyr::pull(VARIANT_ID)
  } else {
    switch <- x %>%
      dplyr::summarise(
        REF_COUNT = sum(REF_COUNT, na.rm = TRUE),
        ALT_COUNT = sum(ALT_COUNT, na.rm = TRUE),
        .by = "VARIANT_ID"
      ) %>%
      dplyr::filter(REF_COUNT < ALT_COUNT) %>%
      dplyr::pull(VARIANT_ID)
  }

  n.switch <- length(switch)

  if (n.switch > 0) {
    if ("counts" %in% dart.check$dart.format) {
      count.what <- "read depth"
    } else {
      count.what <- "alleles"
    }
    message("Calibration required...")
    message("Calibration of alleles based on counts of ", count.what)
    message("Number of markers impacted: ", n.switch)
  }
  return(list(switch = switch, n.switch = n.switch))
}#End detect_calibration_problem

# dart_calibration----------------------------------------------------------------
#' @title dart_calibration
#' @description Calibrate REF and ALT in DArT imported genotypes or metadata
#' @rdname dart_calibration
#' @keywords internal
#' @export
dart_calibration <- function(switch, genotypes = NULL, metadata = NULL) {
  # genotypes recalibration
  if (!is.null(genotypes)) {
    message("Calibration of REF and ALT alleles in genotypes")

    if (rlang::has_name(genotypes, "ALLELE_REF_DEPTH")) {
      new.gen <- dplyr::bind_rows(
        dplyr::filter(genotypes, !VARIANT_ID %in% switch),
        dplyr::filter(genotypes, VARIANT_ID %in% switch) %>%
          dplyr::rename(
            ALT_COUNT = REF_COUNT,
            REF_COUNT = ALT_COUNT,
            ALLELE_ALT_DEPTH = ALLELE_REF_DEPTH,
            ALLELE_REF_DEPTH = ALLELE_ALT_DEPTH
          )
      )
    } else {
      new.gen <- dplyr::bind_rows(
        dplyr::filter(genotypes, !VARIANT_ID %in% switch),
        dplyr::filter(genotypes, VARIANT_ID %in% switch) %>%
          dplyr::rename(ALT_COUNT = REF_COUNT, REF_COUNT = ALT_COUNT)
      )
    }
    return(new.gen)
  }

  # metadata recalibration
  if (!is.null(metadata)) {
    message("Calibration of REF and ALT alleles in markers metadata")
    new.metadata <- dplyr::filter(metadata, VARIANT_ID %in% switch)

    switch.ref <- dplyr::select(new.metadata, dplyr::contains("REF")) %>%
      dplyr::select(-dplyr::contains("REF_AVG"))


    colnames(switch.ref) <- stringi::stri_replace_all_regex(
      str = colnames(switch.ref), pattern = c("_REF", "^REF"),
      replacement = c("_SNP", "ALT"), vectorize_all = FALSE
    )

    switch.alt <- dplyr::select(new.metadata, dplyr::contains(c("ALT", "SNP"))) %>%
      dplyr::select(-dplyr::contains("REF_AVG"))
    colnames(switch.alt) <- stringi::stri_replace_all_fixed(
      str = colnames(switch.alt), pattern = c("SNP", "ALT"),
      replacement = "REF", vectorize_all = FALSE
    )
    new.metadata %<>%
      dplyr::select(-tidyselect::any_of(unique(c(colnames(switch.ref), colnames(switch.alt))))) %>%
      dplyr::bind_cols(switch.ref) %>%
      dplyr::bind_cols(switch.alt)
    switch.alt <- switch.ref <- NULL

    metadata %<>%
      dplyr::filter(!VARIANT_ID %in% switch) %>%
      dplyr::bind_rows(new.metadata) %>%
      dplyr::arrange(VARIANT_ID)
    new.metadata <- NULL
    return(metadata)
  }

}#End dart_calibration


# dart_genotyping_strategy----------------------------------------------------------------
#' @title dart_genotyping_strategy
#' @description Generating genotypes coding is computer costly, strategy ...
#' @rdname dart_genotyping_strategy
#' @keywords internal
#' @export
dart_genotyping_strategy <- function(
    n.markers,
    gt.vcf = NULL,
    gt.vcf.nuc = NULL,
    gt = NULL
) {
  # Depending on the number of markers ...
  # gt.vcf is genotype coding in the VCF: 0/0, 0/1, 1/1, ./.

  if (is.null(gt.vcf)) {
    if (n.markers < 5000) gt.vcf <- TRUE
    if (n.markers >= 5000 && n.markers < 30000) gt.vcf <- TRUE
    if (n.markers >= 30000) gt.vcf <- FALSE
  } else {
    gt.vcf <- gt.vcf
  }
  # gt.vcf.nuc is genotype coding in the VCF but with nucleotides: A/C, ./.
  if (is.null(gt.vcf.nuc)) {
    if (n.markers < 5000) gt.vcf.nuc <- TRUE
    if (n.markers >= 5000 && n.markers < 30000) gt.vcf.nuc <- FALSE
    if (n.markers >= 30000) gt.vcf.nuc <- FALSE
  } else {
    gt.vcf.nuc <- gt.vcf.nuc
  }
  # gt is genotype coding a la genepop: 001002, 000000
  if (is.null(gt)) {
    if (n.markers < 5000) gt <- TRUE
    if (n.markers >= 5000 && n.markers < 30000) gt <- FALSE
    if (n.markers >= 30000) gt <- FALSE
  } else {
    gt <- gt
  }
  return(list(gt.vcf = gt.vcf, gt.vcf.nuc = gt.vcf.nuc, gt = gt))
}# End dart_genotyping_strategy

# dart_genotyper ----------------------------------------------------------------
#' @title dart_genotyper
#' @description Normalization of DArT genotypes
#' @rdname dart_genotyper
#' @keywords internal
#' @export
dart_genotyper <- function(
    x,
    dart.check,
    markers.metadata = NULL,
    gt.vcf = FALSE,
    gt.vcf.nuc = FALSE,
    gt = FALSE
) {
  message("DArT genotypes normalization")

  cli::cli_progress_step(msg = "big files takes more time...", msg_done = "done")

  if (gt) gt.vcf.nuc <- TRUE

  if (gt.vcf.nuc && is.null(markers.metadata)) {
    rlang::abort("Problem during DArT genotypes normalization write to author...")
  } else {
    markers.metadata %<>% dplyr::select(VARIANT_ID, M_SEQ, REF, ALT)
  }

  # Dosage of ALTernate allele -------------------------------------------------
  # modify genotypes meta to generate GT_BIN
  if ("counts" %in% dart.check$dart.format) {

    x %<>%
      dplyr::mutate(
        READ_DEPTH = ALLELE_REF_DEPTH + ALLELE_ALT_DEPTH,
        GT_BIN = dplyr::case_when(
          ALLELE_REF_DEPTH > 0L & ALLELE_ALT_DEPTH == 0L ~ 0L,
          ALLELE_REF_DEPTH > 0L & ALLELE_ALT_DEPTH > 0L ~ 1L,
          ALLELE_REF_DEPTH == 0L & ALLELE_ALT_DEPTH > 0L ~ 2L
        )
      )

    # having 0 count is not equal to GT_BIN = NA... if the other allele has count...
    # in this case it's a real 0 for no counts.
    missing <- dplyr::filter(x, is.na(GT_BIN)) %>%
      dplyr::mutate(
        dplyr::across(
          .cols = c(READ_DEPTH, ALLELE_REF_DEPTH, ALLELE_ALT_DEPTH, REF_COUNT, ALT_COUNT),
          .fns = ~ replace_by_na(data = ., what = 0L)
        )
      )

    x %<>% dplyr::filter(!is.na(GT_BIN)) %>%
      dplyr::bind_rows(missing) %>%
      dplyr::full_join(markers.metadata, by = "VARIANT_ID")

    missing <- NULL
  } else {# for presence and absence: 1row and 2rows
    x %<>%
      dplyr::mutate(
        GT_BIN = REF + ALT,
        REF = NULL,
        ALT = NULL
      ) %>%
      dplyr::full_join(markers.metadata, by = "VARIANT_ID")
  }

  # generate VCF format --------------------------------------------------------
  if (gt.vcf) {
    x %<>%
      dplyr::mutate(
        GT_VCF = dplyr::case_when(
          GT_BIN == 0 ~ "0/0", GT_BIN == 1 ~ "0/1", GT_BIN == 2 ~ "1/1",
          is.na(GT_BIN) ~ "./.")
      )
  }

  # generate VCFnuc format -----------------------------------------------------
  if (gt.vcf.nuc) {
    if (!rlang::has_name(x, "M_SEQ")) {
      x %<>%
        dplyr::full_join(markers.metadata, by = "VARIANT_ID")
    }

    x %<>%
      dplyr::mutate(
        GT_VCF_NUC = dplyr::case_when(
          GT_BIN == 0 ~ stringi::stri_join(REF, REF, sep = "/"),
          GT_BIN == 1 ~ stringi::stri_join(REF, ALT, sep = "/"),
          GT_BIN == 2 ~ stringi::stri_join(ALT, ALT, sep = "/"),
          is.na(GT_BIN) ~ "./.")
      )
  }
  # generate GT format ---------------------------------------------------------
  if (gt) {
    x %<>%
      dplyr::mutate(
        GT = stringi::stri_replace_all_fixed(
          str = GT_VCF_NUC,
          pattern = c("A", "C", "G", "T", "/", ".."),
          replacement = c("001", "002", "003", "004", "", "000000"),
          vectorize_all = FALSE)
      )
  }

  # GDS array format -----------------------------------------------------------
  # generate genotypes format for easy reading into GDS
  x %<>%
    dplyr::mutate(
      GDS_A1 = dplyr::case_when(
        GT_BIN == 0 ~ 0,
        GT_BIN == 1 ~ 0,
        GT_BIN == 2 ~ 1,
        is.na(GT_BIN) ~ NA_integer_),
      GDS_A2 = dplyr::case_when(
        GT_BIN == 0 ~ 0,
        GT_BIN == 1 ~ 1,
        GT_BIN == 2 ~ 1,
        is.na(GT_BIN) ~ NA_integer_)
    )

  # Sorting by individuals and variant -----------------------------------------
  x %<>% dplyr::arrange(ID_SEQ, VARIANT_ID)
  cli::cli_progress_done()
  message("Genotypes formats generated: ")
  message("GT_BIN: genotypes based on dosage of alternate allele: 0, 1, 2, NA")
  if (gt.vcf) message("GT_VCF: genotypes based on VCF format with dosage of alternate allele: 0/0, 0/1, 1/1, ./.)")
  if (gt.vcf.nuc) message("GT_VCF: genotypes based on VCF format with alleles in nucleotide format: A/C, ./.")
  if (gt) message("GT: genotypes based on genepop format 6 digits: 001002, 001001, 000000")
  return(x)
}#End dart_genotyper

# dart2gds----------------------------------------------------------------
#' @title dart2gds
#' @description Transform dart to GDS format
#' @rdname dart2gds
#' @keywords internal
#' @export

dart2gds <- function(
    genotypes,
    strata = NULL,
    markers.meta,
    filename.gds,
    dart.check,
    parallel.core = parallel::detectCores() - 1,
    verbose = TRUE
) {

  # Generate GDS
  x <- radiator_gds(
    genotypes = gt2array(
      genotypes = genotypes, n.ind = nrow(strata), n.snp = nrow(markers.meta)
    ),
    biallelic = TRUE,
    data.source = dart.check$data.type,
    strata = strata,
    markers.meta = markers.meta,
    genotypes.meta = genotypes,
    filename = filename.gds,
    open = TRUE,
    verbose = verbose
  )

  if (verbose) message("done!")
  return(x)
}# End dart2gds


# tidy_dart_metadata--------------------------------------------------------------
#' @name tidy_dart_metadata

#' @title Import and tidy \href{http://www.diversityarrays.com}{DArT} metadata.

#' @description Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and might be of interest for users.
#' The function generate a tidy dataset of
#' \href{http://www.diversityarrays.com}{DArT} markers and associated metadata.
#' Usefull to filter before importing the actual dataset.

#' @param data DArT output file. Note that most popular formats used by DArT are
#' recognised (1- and 2- row format, also called binary, and count data.).
#' If you encounter a problem, sent me your data so that I can update
#' the function. The function can import \code{.csv} or \code{.tsv} files.


#' @inheritParams tidy_genomic_data
#' @inheritParams radiator_common_arguments

#' @return A tidy dataframe with these columns:
#' \enumerate{
#' \item MARKERS: generated by radiator and correspond to CHROM + LOCUS + POS
#' separated by 2 underscores.
#' \item CHROM: the chromosome, for de novo: CHROM_1.
#' \item LOCUS: the locus.
#' \item POS: the SNP id on the LOCUS.
#' \item REF: the reference allele.
#' \item ALT: the alternate allele.
#' \item CALL_RATE: call rate output specific of DArT.
#' \item AVG_COUNT_REF: the coverage for the reference allele, output specific of DArT.
#' \item AVG_COUNT_SNP: the coverage for the alternate allele, output specific of DArT.
#' \item REP_AVG: the reproducibility average, output specific of DArT.
#' }

#' @export
#' @rdname tidy_dart_metadata

#' @examples
#' \dontrun{
#' clownfish.dart.tidy <- radiator::tidy_dart_metadata(
#' data = "clownfish.dart.tsv",
#' verbose = TRUE)
#' }

#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}

tidy_dart_metadata <- function(
    data,
    filename = NULL,
    verbose = FALSE,
    parallel.core = parallel::detectCores() - 1
) {
  # Cleanup-------------------------------------------------------------------
  # obj.keeper <- c(ls(envir = globalenv()), "input")
  file.date <- format(Sys.time(), "%Y%m%d@%H%M")
  if (verbose) message("Execution date@time: ", file.date)
  old.dir <- getwd()
  opt.change <- getOption("width")
  options(width = 70)
  timing <- radiator_tic()
  # res <- list()
  #back to the original directory and options
  on.exit(setwd(old.dir), add = TRUE)
  on.exit(options(width = opt.change), add = TRUE)
  on.exit(radiator_toc(timing), add = TRUE)
  on.exit(radiator_function_header(f.name = "", start = FALSE, verbose = verbose), add = TRUE)
  # on.exit(rm(list = setdiff(ls(envir = sys.frame(-1L)), obj.keeper), envir = sys.frame(-1L)))


  # Checking for missing and/or default arguments ------------------------------
  if (missing(data)) rlang::abort("Input file missing")
  # Date and Time --------------------------------------------------------------
  if (!is.null(filename)) {
    meta.filename <- generate_filename(
      name.shortcut = stringi::stri_join(filename, "_metadata"),
      extension = "tsv"
    )$filename
  }

  data.type <- radiator::detect_genomic_format(data)

  if (!data.type %in% c("dart", "fst.file")) {
    rlang::abort("Contact author to show your DArT data, problem duting import")
  }

  if (verbose) message("Importing DArT markers metadata")

  # Import metadata-------------------------------------------------------------
  if (data.type == "dart") {
    dart.delim <- "," # for csv files

    if (stringi::stri_detect_fixed(
      str = stringi::stri_sub(str = data, from = -4, to = -1),
      pattern = ".tsv")) {
      dart.delim <- "\t" # for tsv files
    }

    dart.check <- detect_dart_format(data)
    if (!dart.check$data.type %in% c("dart", "silico.dart")) {
      rlang::abort("Contact author to show your DArT data, problem during import")
    } else {
      skip.number <- dart.check$skip.number
    }

    dart.col.type <- readr::read_delim(
      file = data,
      delim = dart.delim,
      skip = skip.number,
      n_max = 1,
      na = "-",
      col_names = FALSE,
      col_types = readr::cols(.default = readr::col_character()))


    want <- tibble::tibble(
      INFO = c("ALLELEID", "SNP", "SNPPOSITION", "CALLRATE",
               "AVGCOUNTREF", "AVGCOUNTSNP", "REPAVG"),
      COL_TYPE = c("c", "c", "i", "d", "d", "d", "d"))

    dart.col.type %<>%
      radiator::rad_long(
        x = .,
        cols = tidyselect::everything(),
        names_to = "DELETE",
        values_to = "INFO"
      ) %>%
      dplyr::select(-DELETE) %>%
      dplyr::mutate(INFO = stringi::stri_trans_toupper(INFO)) %>%
      dplyr::left_join(want, by = "INFO") %>%
      dplyr::mutate(COL_TYPE = stringi::stri_replace_na(str = COL_TYPE, replacement = "_")) %>%
      dplyr::select(COL_TYPE) %>%
      purrr::flatten_chr(.) %>% stringi::stri_join(collapse = "")

    input <- suppressMessages(
      suppressWarnings(
        readr::read_delim(
          file = data,
          delim = dart.delim,
          skip = skip.number,
          na = "-",
          col_names = TRUE,
          col_types = dart.col.type)
      )
    )
    colnames(input) <- stringi::stri_trans_toupper(colnames(input))
    colnames(input) <- stringi::stri_replace_all_fixed(
      str = colnames(input),
      pattern = c("AVGCOUNTREF", "AVGCOUNTSNP", "REPAVG", "ALLELEID", "SNPPOSITION", "CALLRATE"),
      replacement = c("AVG_COUNT_REF", "AVG_COUNT_SNP", "REP_AVG", "LOCUS", "POS", "CALL_RATE"),
      vectorize_all = FALSE)

    # Check for duplicate rows (sometimes people combine DArT data...)----------
    input.dup <- dplyr::distinct(input, LOCUS, SNP, POS, CALL_RATE, .keep_all = FALSE)

    # make sure no duplicates
    if (nrow(input) != nrow(input.dup)) {
      input.dup <- NULL
      message("Duplicate rows were identified")
      message("    using distinct rows")
      message("    check input data if downstream problems")
      input <- dplyr::distinct(input, LOCUS, SNP, POS, CALL_RATE, .keep_all = TRUE)
    }
    input.dup <- NULL

    # Tidying data ---------------------------------------------------------------
    want <- c("MARKERS", "CHROM", "LOCUS", "POS", "REF", "ALT", "CALL_RATE",
              "AVG_COUNT_REF", "AVG_COUNT_SNP", "REP_AVG")

    input %<>%
      tidyr::separate(col = LOCUS, into = c("LOCUS", "NOT_USEFUL"), sep = "\\|", extra = "drop") %>%
      dplyr::select(-NOT_USEFUL) %>%
      tidyr::separate(col = SNP, into = c("NOT_USEFUL", "KEEPER"), sep = ":", extra = "drop") %>%
      dplyr::select(-NOT_USEFUL) %>%
      tidyr::separate(col = KEEPER, into = c("REF", "ALT"), sep = ">") %>%
      dplyr::mutate(
        CHROM = rep("CHROM_1", n()),
        MARKERS = stringi::stri_join(CHROM, LOCUS, POS, sep = "__")) %>%
      dplyr::select(tidyselect::any_of(want), tidyselect::everything()) %>%
      dplyr::mutate(
        dplyr::across(
          .cols = c(MARKERS, CHROM, LOCUS, POS),
          .fns = as.character
        )
      ) %>%
      dplyr::arrange(CHROM, LOCUS, POS, REF) %>%
      dplyr::filter(!is.na(REF) | !is.na(ALT)) %>%
      dplyr::distinct(MARKERS, .keep_all = TRUE) %>%
      dplyr::arrange(MARKERS)
  }

  if (data.type == "fst.file") {
    want <- c("MARKERS", "CHROM", "LOCUS", "POS", "REF", "ALT", "CALL_RATE",
              "AVG_COUNT_REF", "AVG_COUNT_SNP", "REP_AVG")
    input <- radiator::read_rad(data) %>%
      dplyr::select(tidyselect::any_of(want)) %>%
      dplyr::distinct(MARKERS, .keep_all = TRUE)
  }

  if (!is.null(filename)) {
    readr::write_tsv(
      x = input,
      file = meta.filename,
    )
    if (verbose) message("File written: ",  basename(filename))
    # short.name <- list.files(path = ".", pattern = "metadata")
    #
    # if (length(short.name) > 1) {
    #   short.name <- file.info(short.name) %>%
    #     tibble::rownames_to_column(df = ., var = "FILE") %>%
    #     dplyr::filter(mtime == max(mtime))
    #   short.name <- short.name$FILE
    # }
    # message("Marker's metadata file written:\n    ", short.name)
  }
  # Results --------------------------------------------------------------------
  if (verbose) {
    n.chrom <- length(unique(input$CHROM))
    n.locus <- length(unique(input$LOCUS))
    n.snp <- length(unique(input$MARKERS))
    message("\nNumber of chrom: ", n.chrom)
    message("Number of locus: ", n.locus)
    message("Number of SNPs: ", n.snp)
  }
  return(input)
}#End tidy_dart_metadata




## Merge dart-------------------------------------------------------------------

#' @title Merge DArT files
#' @description This function allows to merge 2 DArT files (filtered of not).

#' @param dart1 Full path of the first DArT file.
#' @param strata1 Full path of the first strata file for dart1.
#' @param dart2 Full path of the second DArT file.
#' @param strata2 Full path of the second strata file for dart2.

# deprecated
# @param keep.rad Unless the default is changed, the function removes the
# temporary tidy dart files generated for each DArT datasets during import.
# Default: \code{keep.rad = FALSE}.

#' @param filename Name of the merged DArT file.
#' By default, the function gives the merged data the filename:\code{merge_dart}
#' with date and time appended. The function will also append the date and time
#' to the filename provided.
#' The data is written in the working directory.
#' Default: \code{filename = NULL}.

#' @inheritParams tidy_genomic_data
#' @inheritParams read_dart
#' @inheritParams radiator_common_arguments
#' @inheritParams filter_common_markers
#' @inheritParams filter_monomorphic

#' @param remove.non.immortalized.dart.markers (logical). By default the function
#' will remove markers starting \code{1000}, those are called \code{non-immortalized}
#' markers by DArT.
#' Default: \code{remove.non.immortalized.dart.markers = TRUE}.

#' @details
#' The function average across markers the columns: CALL_RATE, REP_AVG,
#' AVG_COUNT_REF and AVG_COUNT_SNP, when found in the data.
#' For DArT, theses columns represent:
#' \itemize{
#' \item CALL_RATE: is the proportion of samples for which the genotype was called.
#' \item REP_AVG: is the proportion of technical replicate assay pairs for which
#' the marker score is consistent.
#' \item AVG_COUND_REF and AVG_COUND_SNP: the mean coverage for the reference and
#' alternate alleles, respectively.
#' }
#'
#' The function removes markers with starting with 1000 that are not immortalized by DArT
#'
#'
#' When the argument \strong{common.markers} is kept to TRUE, the function
#' produces an
#' \href{https://github.com/hms-dbmi/UpSetR}{UpSet plot} to visualize the number
#' of markers common or not between populations. The plot is not saved automatically,
#' this as to be done manually by the user.

#' @return The function returns a list in the global environment and 2 data frames
#' in the working directory. The dataframes are the tidy dataset and a strata file
#' of the 2 merged DArT files.

#' @examples
#' \dontrun{
#' # The simplest way to run the function:
#' merged.data <- radiator::merge_dart(
#' dart1 = "bluefin_larvae.tsv", strata1 = "strata1_bft_larvae.tsv",
#' dart2 = "bluefin_adults.csv", strata2 = "strata2_bft_adults.tsv")
#' }
#' @keywords internal
#' @rdname merge_dart
#' @export
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}

merge_dart <- function(
    dart1, strata1,
    dart2, strata2,
    filter.monomorphic = TRUE,
    filter.common.markers = TRUE,
    # keep.rad = FALSE,
    filename = NULL,
    remove.non.immortalized.dart.markers = TRUE,
    parallel.core = parallel::detectCores() - 1,
    ...
) {

  # test
  # keep.rad = TRUE
  # filename = NULL
  # filter.monomorphic = TRUE
  # filter.common.markers = TRUE
  # parallel.core = 12L
  # remove.non.immortalized.dart.markers = TRUE

  # Cleanup---------------------------------------------------------------------
  obj.keeper <- c(ls(envir = globalenv()), "res")
  radiator_function_header(f.name = "merge_dart", verbose = TRUE)
  file.date <- format(Sys.time(), "%Y%m%d@%H%M")
  message("Execution date@time: ", file.date)
  old.dir <- getwd()
  opt.change <- getOption("width")
  options(future.globals.maxSize= Inf)
  options(width = 70)
  timing <- radiator_tic()
  #back to the original directory and options
  on.exit(setwd(old.dir), add = TRUE)
  on.exit(options(width = opt.change), add = TRUE)
  on.exit(radiator_toc(timing), add = TRUE)
  on.exit(rm(list = setdiff(ls(envir = sys.frame(-1L)), obj.keeper), envir = sys.frame(-1L)))
  on.exit(radiator_function_header(f.name = "merge_dart", start = FALSE, verbose = TRUE), add = TRUE)

  # manage missing arguments -----------------------------------------------------
  if (missing(dart1)) rlang::abort("dart1 file missing")
  if (missing(dart2)) rlang::abort("dart2 file missing")
  if (missing(strata1)) rlang::abort("strata1 file missing")
  if (missing(strata2)) rlang::abort("strata2 file missing")

  # Filename -------------------------------------------------------------------
  # Get date and time to have unique filenaming

  if (is.null(filename)) {
    filename <- generate_filename(
      name.shortcut = "merged_dart",
      extension = "arrow.parquet"
    )$filename

    strata.filename <- generate_filename(
      name.shortcut = "merged_dart_strata",
      extension = "tsv"
    )$filename
  } else {
    filename <- generate_filename(
      name.shortcut = filename,
      extension = "arrow.parquet"
    )$filename

    strata.filename <- generate_filename(
      name.shortcut = stringi::stri_join(filename, "_strata"),
      extension = "tsv"
    )$filename
  }

  # File type detection---------------------------------------------------------
  data.type <- radiator::detect_genomic_format(dart2)
  if (data.type != "dart") rlang::abort("DArT file not recognize by radiator")
  data.type <- radiator::detect_genomic_format(dart1)
  if (data.type != "dart") rlang::abort("DArT file not recognize by radiator")

  # import data ----------------------------------------------------------------
  # generate the folder to store the merged data
  path.folder <- generate_folder(
    rad.folder = "merge_dart",
    path.folder = NULL,
    prefix.int = FALSE,
    internal = FALSE,
    file.date = file.date,
    verbose = TRUE)

  # the DArT data
  message("Importing and tidying dart1...")
  input <- suppressWarnings(
    radiator::read_dart(
      data = dart1,
      strata = strata1,
      filename = "dart1",
      tidy.dart = TRUE,
      parallel.core = parallel.core,
      path.folder = path.folder,
      verbose = FALSE
    )
  )


  message("Importing and tidying dart2...")
  dart2 <- suppressWarnings(
    radiator::read_dart(
      data = dart2,
      strata = strata2,
      filename = "dart2",
      tidy.dart = TRUE,
      parallel.core = parallel.core,
      path.folder = path.folder,
      verbose = FALSE
    )
  )

  # cleaning up non-immortalized markers ---------------------------------------
  markers.before <- length(unique(input$MARKERS)) + length(unique(dart2$MARKERS))
  if (remove.non.immortalized.dart.markers) {
    message("Removing non-immortalized DArT markers...")
    input <- suppressWarnings(dplyr::filter(input, !stringi::stri_detect_regex(str = LOCUS, pattern = "^1000")))
    dart2 <- suppressWarnings(dplyr::filter(dart2, !stringi::stri_detect_regex(str = LOCUS, pattern = "^1000")))
    markers.after <- length(unique(input$MARKERS)) + length(unique(dart2$MARKERS))
    message("    Number of blacklisted markers: ", markers.before - markers.after)
  }
  # merging DArT tidy data -----------------------------------------------------
  # we keep common column
  dart.col <- dplyr::intersect(colnames(input), colnames(dart2))

  input %<>%
    dplyr::select(tidyselect::any_of(dart.col)) %>%
    dplyr::bind_rows(dplyr::select(dart2, tidyselect::any_of(dart.col)))
  dart2 <- NULL



  # Remove weird markers ---------------------------------------------------------
  setwd(path.folder)
  biallelic.data <- NULL # global variables...
  input <- radiator::detect_biallelic_problems(
    data = input,
    verbose = TRUE,
    parallel.core = parallel.core
  ) %$%
    biallelic.data
  setwd(old.dir)

  if (rlang::has_name(input, "POLYMORPHIC")) {
    input %<>% dplyr::select(-POLYMORPHIC)
  }

  # Averaging across markers the call rate and other DArT markers metadata statistics
  if (rlang::has_name(input, "CALL_RATE")) {
    message("Averaging across markers the call rate")
    input %<>%
      dplyr::group_by(MARKERS) %>%
      dplyr::mutate(CALL_RATE = mean(CALL_RATE)) %>%
      dplyr::ungroup(.)
  }

  if (rlang::has_name(input, "REP_AVG")) {
    message("Averaging across markers the REP_AVG")
    input %<>%
      dplyr::group_by(MARKERS) %>%
      dplyr::mutate(REP_AVG = mean(REP_AVG)) %>%
      dplyr::ungroup(.)
  }

  if (rlang::has_name(input, "AVG_COUNT_REF")) {
    message("Averaging across markers the coverage for the REF allele")
    input %<>%
      dplyr::group_by(MARKERS) %>%
      dplyr::mutate(AVG_COUNT_REF = mean(AVG_COUNT_REF)) %>%
      dplyr::ungroup(.)
  }

  if (rlang::has_name(input, "AVG_COUNT_ALT")) {
    message("Averaging across markers the coverage for the ALT allele")
    input %<>%
      dplyr::group_by(MARKERS) %>%
      dplyr::mutate(AVG_COUNT_REF = mean(AVG_COUNT_REF)) %>%
      dplyr::ungroup(.)
  }

  # monomorphic markers (the TRUE/FALSE is included inside the function)
  input <- radiator::filter_monomorphic(
    data = input,
    filter.monomorphic = filter.monomorphic,
    path.folder = path.folder,
    verbose = TRUE)

  # common markers (the TRUE/FALSE is included inside the function)
  input <- filter_common_markers(
    data = input,
    filter.common.markers = filter.common.markers,
    path.folder = path.folder,
    verbose = TRUE
  )


  # Write tidy in the working directory
  radiator::write_rad(data = input, filename = filename)

  strata <- radiator::generate_strata(data = input, pop.id = FALSE)
  readr::write_tsv(x = strata, file = strata.filename)

  # results --------------------------------------------------------------------
  message("\nMerged DArT file and strata file generated:")
  message("    ", filename)
  message("    ", strata.filename)

  message("\nNumber of chrom / locus / SNP:")
  message(length(unique(input$CHROM)), " / ", length(unique(input$LOCUS)), " / ",
          length(unique(input$MARKERS)), "\n")
  radiator::summary_strata(strata = strata)
  return(res = list(merged.dart = input, strata = strata))
}#End merge_dart
thierrygosselin/radiator documentation built on July 4, 2025, 7:52 a.m.