R/filter_blacklist_genotypes.R

Defines functions filter_blacklist_genotypes read_blacklist_genotypes

Documented in filter_blacklist_genotypes read_blacklist_genotypes

# read_blacklist_genotypes ---------------------------------------------------------------
#' @name read_blacklist_genotypes
#' @title read blacklist of genotypes
#' @description Read a blacklist object or file.
#'
#'
#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and might be of interest for users.
#'
#' @param blacklist.genotypes (path or object)
#' The blacklist is an object in your
#' global environment or a file in the working directory (e.g. "blacklist.geno.tsv").
#' The dataframe contains at least these 2 columns: \code{MARKERS, INDIVIDUALS}.
#' Additional columns are allowed: \code{CHROM, LOCUS, POS}.
#'
#' Useful to erase genotypes with bad QC, e.g. genotype with more than 2 alleles
#' in diploid likely
#' sequencing errors or genotypes with poor genotype likelihood or coverage.
#'
#' Columns are cleaned of separators that interfere with some packages or codes, detailed in
#' \code{\link{clean_markers_names}} and \code{\link{clean_ind_names}}
#' Default \code{blacklist.genotypes = NULL}.
#'
#' @inheritParams radiator_common_arguments
#' @examples
#' \dontrun{
#' bl <- radiator::read_blacklist_genotypes(data = data,
#'     blacklist.genotypes = "blacklist.geno.iguana.tsv")
#' }

#' @section Life cycle:
#'
#' This function arguments will be subject to changes. Currently the function uses
#' erase.genotypes, but using the \code{dots-dots-dots ...} arguments allows to
#' pass \code{erase.genotypes and masked.genotypes}. These arguments do exactly
#' the same thing and only one can be used.

#' @export
#' @rdname read_blacklist_genotypes
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}


read_blacklist_genotypes <- function(
  blacklist.genotypes = NULL,
  verbose = FALSE,
  ...
) {
  # dotslist -------------------------------------------------------------------
  dotslist <- rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE)
  want <- c("erase.genotypes", "masked.genotypes")
  unknowned_param <- setdiff(names(dotslist), want)

  if (length(unknowned_param) > 0) {
    rlang::abort("Unknowned \"...\" parameters ",
                 stringi::stri_join(unknowned_param, collapse = " "))
  }

  radiator.dots <- dotslist[names(dotslist) %in% want]
  erase.genotypes <- radiator.dots[["erase.genotypes"]]
  masked.genotypes <- radiator.dots[["masked.genotypes"]]

  if (!is.null(erase.genotypes) && !is.null(masked.genotypes) && !is.null(blacklist.genotypes)) {
    rlang::abort("Only one of: erase.genotypes, masked.genotypes or blacklist.genotypes should be used")
  }

  if (!is.null(erase.genotypes) && is.null(blacklist.genotypes)) {
    blacklist.genotypes <- erase.genotypes
  }
  if (!is.null(masked.genotypes) && is.null(blacklist.genotypes)) {
    blacklist.genotypes <- masked.genotypes
  }

  if (!is.null(blacklist.genotypes)) {
    # Import whitelist of markers
    if (is.vector(blacklist.genotypes)) {
      blacklist.genotypes <- suppressMessages(
        readr::read_tsv(blacklist.genotypes, col_names = TRUE) %>%
          dplyr::mutate(dplyr::across(tidyselect::everything(), .fns = as.character))
      )
    }
    nrow.before <- nrow(blacklist.genotypes)
    blacklist.genotypes <- dplyr::distinct(blacklist.genotypes)
    nrow.after <- nrow(blacklist.genotypes)
    duplicate.blacklist.genotypes <- nrow.before - nrow.after
    if (duplicate.blacklist.genotypes > 0) {
      if (verbose) message("Duplicated rows in blacklist genotypes: ", duplicate.blacklist.genotypes)
      if (verbose) message("    Creating unique blacklist")
      if (verbose) message("    Warning: downstream results might be impacted by this, check how you made your blacklist")
    }
    nrow.before <- duplicate.blacklist.genotypes <- nrow.after <- NULL

    # cleaning names of markers
    need.cleaning <- purrr::discard(
      .x = colnames(blacklist.genotypes),
      .p = colnames(blacklist.genotypes) %in% "INDIVIDUALS")

    blacklist.genotypes <- dplyr::mutate(
      .data = blacklist.genotypes,
      dplyr::across(
        .cols = need.cleaning,
        .fns = clean_markers_names
      )
    )

    blacklist.genotypes <- dplyr::mutate(
      .data = blacklist.genotypes,
      dplyr::across(
        .cols = "INDIVIDUALS",
        .fns = clean_ind_names
      )
    )
  } else {
    blacklist.genotypes <- NULL
  }
  return(blacklist.genotypes)
}#End


# filter_blacklist_genotypes ---------------------------------------------------------------
#' @name filter_blacklist_genotypes
#' @title Filter dataset with blacklist of genotypes
#' @description  Filter dataset with blacklist of genotypes.
#'
#' This function allows to blacklist/erase/mask genotypes.
#'
#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and might be of interest for users.
#'
#' @param data (4 options) A file or object generated by radiator:
#' \itemize{
#' \item tidy data
#' \item Genomic Data Structure (GDS)
#' }
#'
#' \emph{How to get GDS and tidy data ?}
#' Look into \code{\link{tidy_genomic_data}},
#' \code{\link{read_vcf}} or
#' \code{\link{tidy_vcf}}.
#'
#'
#' @inheritParams radiator_common_arguments
#' @inheritParams read_blacklist_genotypes

#' @examples
#' \dontrun{
#' data <- radiator::filter_blacklist_genotypes(
#'     data = data, blacklist.geno = "blacklist.geno.tsv"
#'     )
#' }

#' @section Life cycle:
#'
#' This function arguments will be subject to changes. Currently the function uses
#' erase.genotypes, but using the \code{dots-dots-dots ...} arguments allows to
#' pass \code{erase.genotypes and masked.genotypes}. These arguments do exactly
#' the same thing and only one can be used.

#' @export
#' @rdname filter_blacklist_genotypes
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}


filter_blacklist_genotypes <- function(
  data,
  blacklist.genotypes,
  verbose = TRUE,
  ...
) {

  # Checking for missing and/or default arguments ------------------------------
  if (missing(data)) rlang::abort("Input file missing")
  file.date <- format(Sys.time(), "%Y%m%d@%H%M")# Date and time

  # dotslist -------------------------------------------------------------------
  dotslist <- rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE)
  want <- c("path.folder", "parameters", "erase.genotypes", "masked.genotypes")
  unknowned_param <- setdiff(names(dotslist), want)

  if (length(unknowned_param) > 0) {
    rlang::abort("Unknowned \"...\" parameters ",
                 stringi::stri_join(unknowned_param, collapse = " "))
  }

  radiator.dots <- dotslist[names(dotslist) %in% want]
  parameters <- radiator.dots[["parameters"]]
  erase.genotypes <- radiator.dots[["erase.genotypes"]]
  masked.genotypes <- radiator.dots[["masked.genotypes"]]
  path.folder <- radiator.dots[["path.folder"]]
  path.folder <- generate_folder(f = path.folder, file.date = file.date, verbose = verbose)

  if (!is.null(erase.genotypes) && !is.null(masked.genotypes) && !is.null(blacklist.genotypes)) {
    rlang::abort("Only one of: erase.genotypes, masked.genotypes or blacklist.genotypes should be used")
  }

  if (!is.null(erase.genotypes) && is.null(blacklist.genotypes)) {
    blacklist.genotypes <- erase.genotypes
  }
  if (!is.null(masked.genotypes) && is.null(blacklist.genotypes)) {
    blacklist.genotypes <- masked.genotypes
  }


  # read whitelist
  blacklist.genotypes <- radiator::read_blacklist_genotypes(
    blacklist.genotypes = blacklist.genotypes,
    verbose = verbose)

  if (!is.null(blacklist.genotypes)) {
    # Import data ---------------------------------------------------------------
    data.type <- radiator::detect_genomic_format(data)
    if (!data.type %in% c("tbl_df", "fst.file", "SeqVarGDSClass", "gds.file")) {
      rlang::abort("Input not supported for this function: read function documentation")
    }

    if (data.type %in% c("SeqVarGDSClass", "gds.file")) {
      radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)

      if (data.type == "gds.file") {
        data <- radiator::read_rad(data, verbose = verbose)
        data.type <- "SeqVarGDSClass"
      }
    } else {
      if (is.vector(data)) data <- radiator::tidy_wide(data = data, import.metadata = TRUE)
      data.type <- "tbl_df"
    }

    # Filter parameter file: generate and initiate -----------------------------

    # TODO: modify the function to accomodate erasing genotypes...

    # filters.parameters <- radiator_parameters(
    #   generate = TRUE,
    #   initiate = TRUE,
    #   update = FALSE,
    #   parameter.obj = parameters,
    #   data = data,
    #   path.folder = path.folder,
    #   file.date = file.date,
    #   verbose = verbose)

    # checks -------------------------------------------------------------------
    # check to keep only individuals present in the dataset and not already blacklisted
    # check that markers bl are in dataset (not already blacklisted elsewhere...)
    if (verbose) message("Checking matching individuals and markers between blacklist and data")
    if (data.type == "tbl_df") {
      id <- unique(data$INDIVIDUALS)
      wl <- unique(data$MARKERS)
    } else {
      radiator.gds <- gdsfmt::index.gdsn(
        node = data, path = "radiator", silent = TRUE)

      id <- gdsfmt::index.gdsn(
        node = radiator.gds, path = "individuals", silent = TRUE)

      if (!is.null(id)) {
        id <- gdsfmt::read.gdsn(id) %$% INDIVIDUALS
      } else {
        id <- SeqArray::seqGetData(gdsfile = data, var.name = "sample.id")
      }
      wl <- radiator::extract_markers_metadata(gds = data, whitelist = TRUE) %$% MARKERS
    }
    blacklist.genotypes  %<>% dplyr::filter(INDIVIDUALS %in% id) %>%
      dplyr::filter(MARKERS %in% wl)
    n.markers.bl <- nrow(blacklist.genotypes)

    if (n.markers.bl > 0) {
      if (verbose) message("Blacklisted genotypes: ", n.markers.bl)

      if (data.type == "tbl_df") {

        blacklist.genotypes <- dplyr::semi_join(
          data,
          blacklist.genotypes,
          by = intersect(colnames(data), colnames(blacklist.genotypes)))

        if (tibble::has_name(blacklist.genotypes, "GT")) {
          blacklist.genotypes %<>% dplyr::mutate(GT = rep("000000", n()))
        }

        if (tibble::has_name(blacklist.genotypes, "GT_VCF")) {
          blacklist.genotypes %<>% dplyr::mutate(GT_VCF = rep("./.", n()))
        }

        if (tibble::has_name(blacklist.genotypes, "GT_VCF_NUC")) {
          blacklist.genotypes %<>% dplyr::mutate(GT_VCF_NUC = rep("./.", n()))
        }

        if (tibble::has_name(blacklist.genotypes, "GT_BIN")) {
          blacklist.genotypes %<>% dplyr::mutate(GT_BIN = rep(as.numeric(NA_character_), n()))
        }

        data  %<>% dplyr::anti_join(
          blacklist.genotypes,
          by = intersect(colnames(data), colnames(blacklist.genotypes)))
        data  %<>% dplyr::bind_rows(blacklist.genotypes)

        # TODO and optimize...
        # required because REF/ALT might change after deleting genotypes...
        data  %<>% radiator::calibrate_alleles(data = .) %$% input

        # GDS
        if (data.type == "SeqVarGDSClass") {
          message("Under construction...")

        } # End GDS

      } else {
        message("There are no genotype left in the blacklist")
      }


      # Filter parameter file: update --------------------------------------------
      # filters.parameters <- radiator_parameters(
      #   generate = FALSE,
      #   initiate = FALSE,
      #   update = TRUE,
      #   parameter.obj = filters.parameters,
      #   data = data,
      #   filter.name = "whitelist markers",
      #   param.name = "whitelist.markers",
      #   values = n.markers.w,
      #   path.folder = path.folder,
      #   file.date = file.date,
      #   verbose = verbose)

      # if (verbose) {
      #   message("Number of individuals / strata / chrom / locus / SNP:")
      #   message("    Before: ", filters.parameters$filters.parameters$BEFORE)
      #   message("    Blacklisted: ", filters.parameters$filters.parameters$BLACKLIST)
      #   message("    After: ", filters.parameters$filters.parameters$AFTER)
      # }
    }# End !is.null
  }
  return(data)
}#End filter_blacklist_genotypes



# The section that was in tidy_genomic_data
# if (is.null(blacklist.genotype)) { # no Whitelist
#   if (verbose) message("Erasing genotype: no")
# } else {
#   if (verbose) message("Erasing genotype: yes")
#   want <- c("MARKERS", "CHROM", "LOCUS", "POS", "INDIVIDUALS")
#   if (is.vector(blacklist.genotype)) {
#     suppressWarnings(suppressMessages(
#       blacklist.genotype <- readr::read_tsv(blacklist.genotype, col_names = TRUE)))
#   }
#   suppressWarnings(suppressMessages(
#     blacklist.genotype <- blacklist.genotype %>%
#       dplyr::mutate(dplyr::across(.cols = "INDIVIDUALS", .fns = clean_ind_names)) %>%dplyr::mutate(dplyr::across(everything(), .fns = as.character))
#       dplyr::select(tidyselect::any_of(want)) %>%
# dplyr::mutate(dplyr::across(everything(), .fns = as.character, exclude = NA))
#   columns.names.blacklist.genotype <- colnames(blacklist.genotype)
#
#   if (data.type == "haplo.file") {
#     blacklist.genotype <- dplyr::select(.data = blacklist.genotype, INDIVIDUALS, LOCUS)
#     columns.names.blacklist.genotype <- colnames(blacklist.genotype)
#   }
#
#   # control check to keep only individuals in the strata.df
#   blacklist.genotype <- suppressWarnings(
#     blacklist.genotype  %>%
#       dplyr::filter(INDIVIDUALS %in% strata.df$INDIVIDUALS)
#   )
#
#   # control check to keep only whitelisted markers from the blacklist of genotypes
#   if (!is.null(whitelist.markers)) {
#     if (verbose) message("Control check to keep only whitelisted markers present in the blacklist of genotypes to erase.")
#     # updating the whitelist of markers to have all columns that id markers
#     if (data.type == "vcf.file") {
#       whitelist.markers.ind <- input %>% dplyr::distinct(CHROM, LOCUS, POS, INDIVIDUALS)
#     } else {
#       whitelist.markers.ind <- input %>% dplyr::distinct(LOCUS, INDIVIDUALS)
#     }
#
#     # updating the blacklist.genotype
#     blacklist.genotype <- suppressWarnings(
#       dplyr::semi_join(whitelist.markers.ind, blacklist.genotype,
#                        by = columns.names.blacklist.genotype))
#     columns.names.blacklist.genotype <- colnames(blacklist.genotype)
#   }
#
#   # Update column names
#   columns.names.blacklist.genotype <- colnames(blacklist.genotype)
#
#   blacklisted.gen.number <- nrow(blacklist.genotype)
#   if (blacklisted.gen.number > 0) {
#     message("    Number of genotype(s) to erase: ", blacklisted.gen.number)
#     input.erase <- dplyr::semi_join(
#       input, blacklist.genotype, by = columns.names.blacklist.genotype) %>%
#       dplyr::mutate(GT = rep("000000", n()))
#     input <- dplyr::anti_join(
#       input, blacklist.genotype, by = columns.names.blacklist.genotype)
#     if (rlang::has_name(input.erase, "GT_VCF")) {
#       input.erase <- dplyr::mutate(input.erase, GT_VCF = rep("./.", n()))
#     }
#
#     if (rlang::has_name(input.erase, "GT_VCF_NUC")) {
#       input.erase <- dplyr::mutate(input.erase, GT_VCF_NUC = rep("./.", n()))
#     }
#
#     if (rlang::has_name(input.erase, "GT_BIN")) {
#       input.erase <- dplyr::mutate(input.erase, GT_BIN = rep(as.numeric(NA_character_), n()))
#     }
#     input <- dplyr::bind_rows(input, input.erase)
#   } else {
#     message("There are no genotype left in the blacklist: input file left intact")
#   }
#
#   # required because REF/ALT might change after deleting genotypes...
#   input <- radiator::calibrate_alleles(data = input)$input
# } # End erase genotypes
thierrygosselin/radiator documentation built on May 5, 2024, 5:12 a.m.