R/detect_biallelic_problems.R

Defines functions detect_biallelic_problems

Documented in detect_biallelic_problems

# Detect the number of alleles/nucleotides per bi-allelic markers

#' @name detect_biallelic_problems

#' @title Detect biallelic problems

#' @description Detect the number of alleles/nucleotides per markers.
#' Sometimes, a dataset might have 3 alleles at a SNP, is this biological or artifactual ?
#' This function helps to resolve this, by highlighting markers with this potential
#' problem, so that user can further look at the origin of the phenomenon.
#' The function can also split datasets in biallelic/multiallelic datasets.

#' @param data A tidy data frame object in the global environment or
#' a tidy data frame in wide or long format in the working directory.
#' The tidy dataset needs a column with nucleotide information \code{GT_VCF_NUC},
#' usually this is automatically generated by \pkg{radiator}.
#' \emph{How to get a tidy data frame ?}
#' Look into \pkg{radiator} \code{\link{tidy_genomic_data}}.

#' @param verbose (optional, logical) \code{verbose = TRUE} to be chatty
#' during execution.
#' Default: \code{verbose = TRUE}.

#' @param parallel.core (optional) The number of core used for parallel
#' execution.
#' Default: \code{parallel.core = parallel::detectCores() - 1}.


#' @return Several info

#' @export
#' @rdname detect_biallelic_problems
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}

detect_biallelic_problems <- function(
  data,
  verbose = TRUE,
  parallel.core = parallel::detectCores() - 1
  ) {

  # Cleanup-------------------------------------------------------------------
  radiator_function_header(f.name = "detect_biallelic_problems", verbose = verbose)
  file.date <- format(Sys.time(), "%Y%m%d@%H%M")
  if (verbose) message("Execution date@time: ", file.date)
  old.dir <- getwd()
  opt.change <- getOption("width")
  options(width = 70)
  timing <- radiator_tic()
  #back to the original directory and options
  on.exit(setwd(old.dir), add = TRUE)
  on.exit(options(width = opt.change), add = TRUE)
  on.exit(radiator_toc(timing), add = TRUE)
  on.exit(radiator_function_header(f.name = "detect_biallelic_problems", start = FALSE, verbose = verbose), add = TRUE)
  res <- list()

  # Checking for missing and/or default arguments ------------------------------
  if (missing(data)) rlang::abort("Input file missing")
  # Get data and time and generate a filename
  # in case problematic markers are found and printed
  # filename <- stringi::stri_join("radiator_biallelic_problem_", format(Sys.time(), "%Y%m%d@%H%M"), ".tsv")

  # Import data ---------------------------------------------------------------
  if (is.vector(data)) {
    data <- radiator::tidy_wide(data = data, import.metadata = TRUE)
  }
  if (!tibble::has_name(data, "GT_VCF_NUC")) {
    message("Tidy dataset requires genotypes with nuclotides: GT_VCF_NUC")
    data <- radiator::calibrate_alleles(
      data = data,
      biallelic = TRUE,
      verbose = TRUE,
      gt.vcf.nuc = TRUE
    ) %$%
      input
  }

  if (!tibble::has_name(data, "GT_VCF_NUC")) {
    message("Wrong genotype format")
    message("Tidy dataset requires genotypes with nuclotides: GT_VCF_NUC")
    rlang::abort("One way to get it is to use radiator::calibrate_alleles")
  }

  # Check data ----------------------------------------------------------
  message("Generating statistics...")
  markers.metadata <- purrr::keep(
    .x = colnames(data),
    .p = colnames(data) %in% c("MARKERS", "CHROM", "LOCUS", "POS"))
  want <- c("MARKERS", "CHROM", "LOCUS", "POS", "GT_VCF_NUC")
  blacklist.markers <- dplyr::select(data, tidyselect::any_of(want)) %>%
    dplyr::filter(GT_VCF_NUC != "./.") %>%
    dplyr::distinct(MARKERS, GT_VCF_NUC, .keep_all = TRUE) %>%
    separate_gt(x = ., gt = "GT_VCF_NUC", exclude = markers.metadata, split.chunks = 3L, haplotypes = TRUE) %>%
    dplyr::distinct(MARKERS, HAPLOTYPES, .keep_all = TRUE) %>%
    dplyr::group_by(dplyr::across(markers.metadata)) %>%
    dplyr::tally(.) %>%
    dplyr::filter(n > 2) %>%
    dplyr::rename(N_ALLELES = n)

  if (nrow(blacklist.markers) == 0) {
    message("No markers to blacklist")
    res$blacklist.markers <- "No markers blacklisted"
    res$blacklist.info <- "No markers blacklisted"
    res$biallelic.data <- data
    res$multiallelic.data <- NULL
  } else {
    # write the blacklist
    blacklist.filename <- stringi::stri_join("blacklist.markers.not.biallelic_", file.date, ".tsv")
    readr::write_tsv(x = blacklist.markers, file = blacklist.filename)

    res$blacklist.markers <- blacklist.markers


    exclude.vars <- purrr::keep(
      .x = colnames(data),
      .p = !colnames(data) %in% "GT_VCF_NUC")

    group.vars <- purrr::keep(
      .x = colnames(data),
      .p = colnames(data) %in% c("MARKERS", "CHROM", "LOCUS", "POS", "POP_ID"))

    # Generating biallelic/multiallelic datasets -------------------------------
    res$multiallelic.data <- dplyr::filter(data, MARKERS %in% blacklist.markers$MARKERS)
    res$biallelic.data <- dplyr::filter(data, !MARKERS %in% blacklist.markers$MARKERS)

    data <- NULL # no longer required

    # Blacklist additionnal info -----------------------------------------------
    blacklist.info <- dplyr::filter(res$multiallelic.data, GT_VCF_NUC != "./.") %>%
      separate_gt(x = ., gt = "GT_VCF_NUC", exclude = exclude.vars, split.chunks = 3L) %>%
      dplyr::select(-ALLELES_GROUP) %>%
      dplyr::group_by(dplyr::across(c(group.vars, "ALLELES"))) %>%
      dplyr::summarise(N = n()) %>%
      dplyr::ungroup(.) %>%
      dplyr::group_by_at(dplyr::vars(c(markers.metadata, "ALLELES"))) %>%
      dplyr::mutate(REF = sum(N)) %>%
      dplyr::ungroup(.) %>%
      dplyr::arrange(MARKERS, ALLELES)

    markers.low.mac <- blacklist.info %>%
      dplyr::group_by(dplyr::across(markers.metadata)) %>%
      dplyr::filter(REF == min(REF)) %>%
      dplyr::distinct(MARKERS, REF, .keep_all = TRUE) %>%
      dplyr::filter(REF <=1) %>%
      dplyr::ungroup(.) %>%
      nrow

    blacklist.info.filename <- stringi::stri_join("blacklist.markers.not.biallelic.info_", file.date, ".tsv")
    blacklist.info <- blacklist.info %>%
      dplyr::group_by(dplyr::across(markers.metadata)) %>%
      dplyr::mutate(REF = dplyr::if_else(REF == max(REF), "REF", "ALT")) %>%
      dplyr::ungroup(.) %>%
      dplyr::group_by(dplyr::across(c(markers.metadata, "ALLELES", "REF"))) %>%
      tidyr::pivot_wider(data = ., names_from = "POP_ID", values_from = "N") %>%
      readr::write_tsv(x = ., file = blacklist.info.filename, na = "-")
    res$blacklist.info <- blacklist.info
    dodgy.markers <- dplyr::n_distinct(blacklist.info$MARKERS)
    message("\nNumber of suspicious markers (> 2 alleles): ", dodgy.markers)
    message("Number of suspicious markers with 1 count for the ALT allele: ", markers.low.mac)
    message("Number of suspicious markers after low count removed: ", dodgy.markers - markers.low.mac)

    message("\nblacklist of markers with > 2 alleles:")
    message("    ", blacklist.filename)
    message("\nblacklist with more info:")
    message("    ", blacklist.info.filename)

    message("\nBiallelic and multiallelic tidy datasets objects generated")

    # if (tibble::has_name(data, "AVG_COUNT_REF")) {
    #   data3 <- data %>%
    #     dplyr::filter(MARKERS %in% blacklist.markers$MARKERS) %>%
    #     dplyr::group_by(MARKERS, GT_VCF_NUC) %>%
    #     dplyr::summarise(
    #       NUMBER_SAMPLES = n(),
    #       AVG_COUNT_REF = mean(AVG_COUNT_REF),
    #       AVG_COUNT_SNP = mean(AVG_COUNT_SNP)) %>%
    #     dplyr::arrange(MARKERS, dplyr::desc(NUMBER_SAMPLES))
    # } else if (tibble::has_name(data, "ALLELE_REF_DEPTH")) {
    #   data3 <- data %>%
    #     dplyr::filter(MARKERS %in% blacklist.markers$MARKERS) %>%
    #     dplyr::group_by(MARKERS, GT_VCF_NUC) %>%
    #     dplyr::summarise(NUMBER_SAMPLES = n()) %>%
    #     dplyr::arrange(MARKERS, dplyr::desc(NUMBER_SAMPLES))
    # } else {
    #   data3 <- data %>%
    #     dplyr::filter(MARKERS %in% blacklist.markers$MARKERS) %>%
    #     dplyr::group_by(MARKERS, GT_VCF_NUC) %>%
    #     dplyr::summarise(NUMBER_SAMPLES = n()) %>%
    #     dplyr::arrange(MARKERS, dplyr::desc(NUMBER_SAMPLES))
    # }
  }
  return(res)
}#End detect_biallelic_problems
thierrygosselin/radiator documentation built on May 5, 2024, 5:12 a.m.