R/filter_whitelist.R

Defines functions generate_whitelist filter_whitelist read_whitelist

Documented in filter_whitelist generate_whitelist read_whitelist

# read_whitelist ---------------------------------------------------------------
#' @name read_whitelist
#' @title read whitelist of markers
#' @description Read a whitelist object or file.
#'
#'
#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and might be of interest for users.
#'
#' @param whitelist.markers (path or object)
#' The whitelist is an object in your
#' global environment or a file in the working directory (e.g. "whitelist.txt").
#' The dataframe contains one, a combination
#' or all of these columns: \code{MARKERS, CHROM, LOCUS, POS}.
#' Columns are cleaned of separators that interfere with some packages or codes, detailed in
#' \code{\link{clean_markers_names}}.
#' Default \code{whitelist.markers = NULL}.
#'
#' @inheritParams radiator_common_arguments

#' @details
#'
#' \strong{multi allelic datasets:}
#' Example: VCF with haplotypes containing \code{CHROM, LOCUS, POS} columns:
#' If the whitelist was not created from the same dataset,
#' the filtering could result in losing all the markers.
#' The POS column is different in biallelic and multiallelic file...


#' @examples
#' \dontrun{
#' wl <- radiator::read_whitelist(data = data, whitelist.markers = "mywhitelist.tsv")
#' }
#' @seealso \code{\link{filter_whitelist}}.
#' @export
#' @rdname read_whitelist
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}


read_whitelist <- function(whitelist.markers = NULL, verbose = FALSE) {
  if (!is.null(whitelist.markers)) {
    # Import whitelist of markers
    if (is.vector(whitelist.markers)) {
      whitelist.markers <- suppressMessages(
        readr::read_tsv(whitelist.markers, col_names = TRUE) %>%
          dplyr::mutate(dplyr::across(tidyselect::everything(), .fns = as.character))
      )
    }
    nrow.before <- nrow(whitelist.markers)
    whitelist.markers <- dplyr::distinct(whitelist.markers)
    nrow.after <- nrow(whitelist.markers)
    duplicate.whitelist.markers <- nrow.before - nrow.after
    if (duplicate.whitelist.markers > 0) {
      if (verbose) message("Duplicated rows in whitelist of markers: ", duplicate.whitelist.markers)
      if (verbose) message("    Creating unique whitelist")
      if (verbose) message("    Warning: downstream results might be impacted by this, check how you made your whitelist...")
    }
    nrow.before <- duplicate.whitelist.markers <- nrow.after <- NULL

    # cleaning names of markers
    whitelist.markers <- dplyr::mutate(
      .data = whitelist.markers,
      dplyr::across(tidyselect::everything(), .fns = clean_markers_names)
    )

    if (tibble::has_name(whitelist.markers, "VARIANT_ID")) {
      whitelist.markers$VARIANT_ID <- as.integer(whitelist.markers$VARIANT_ID)
    }

  } else {
    whitelist.markers <- NULL
  }
  return(whitelist.markers)
}#End


# filter_whitelist ---------------------------------------------------------------
#' @name filter_whitelist
#' @title Filter dataset with whitelist of markers
#' @description  Filter dataset with whitelist of markers
#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and might be of interest for users.
#'
#' @param data (4 options) A file or object generated by radiator:
#' \itemize{
#' \item tidy data
#' \item Genomic Data Structure (GDS)
#' }
#'
#' \emph{How to get GDS and tidy data ?}
#' Look into \code{\link{tidy_genomic_data}},
#' \code{\link{read_vcf}} or
#' \code{\link{tidy_vcf}}.
#'
#'
#' @inheritParams radiator_common_arguments
#' @inheritParams read_whitelist

#' @examples
#' \dontrun{
#' data <- radiator::filter_whitelist(data = data, whitelist.markers = "mywhitelist.tsv")
#' }

#' @export
#' @rdname filter_whitelist
#' @seealso \code{\link{read_whitelist}}.
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}


filter_whitelist <- function(
  data,
  whitelist.markers = NULL,
  verbose = TRUE,
  ...) {
  # obj.keeper <- c(ls(envir = globalenv()), "data")
  if (is.null(whitelist.markers)) return(data)
  radiator_function_header(f.name = "filter_whitelist", verbose = verbose)

  # Cleanup---------------------------------------------------------------------
  file.date <- format(Sys.time(), "%Y%m%d@%H%M")
  if (verbose) message("Execution date@time: ", file.date)
  old.dir <- getwd()
  opt.change <- getOption("width")
  options(width = 70)
  timing <- radiator_tic()
  #back to the original directory and options
  on.exit(setwd(old.dir), add = TRUE)
  on.exit(options(width = opt.change), add = TRUE)
  on.exit(radiator_toc(timing), add = TRUE)
  on.exit(radiator_function_header(f.name = "filter_whitelist", start = FALSE, verbose = verbose), add = TRUE)
  # on.exit(rm(list = setdiff(ls(envir = sys.frame(-1L)), obj.keeper), envir = sys.frame(-1L)))

  # Checking for missing and/or default arguments ------------------------------
  if (missing(data)) rlang::abort("Input file missing")

  # Function call and dotslist -------------------------------------------------
  rad.dots <- radiator_dots(
    func.name = as.list(sys.call())[[1]],
    fd = rlang::fn_fmls_names(),
    args.list = as.list(environment()),
    dotslist = rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE),
    keepers = c("path.folder", "parameters", "biallelic", "markers.meta", "internal"),
    verbose = verbose
  )

  # Folders---------------------------------------------------------------------
  path.folder <- generate_folder(
    f = path.folder,
    rad.folder = "filter_whitelist",
    internal = internal,
    file.date = file.date,
    verbose = verbose)

  # write the dots file
  write_rad(
    data = rad.dots,
    path = path.folder,
    filename = stringi::stri_join(
      "radiator_filter_whitelist_args_", file.date, ".tsv"),
    tsv = TRUE,
    internal = internal,
    verbose = verbose
  )

  # read whitelist
  whitelist.markers <- radiator::read_whitelist(
    whitelist.markers = whitelist.markers,
    verbose = verbose)

  n.markers.w <- nrow(whitelist.markers)

  if (verbose) message("Whitelist of markers: ", n.markers.w)

  # Import data ---------------------------------------------------------------
  data.type <- radiator::detect_genomic_format(data)
  if (!data.type %in% c("tbl_df", "fst.file", "SeqVarGDSClass", "gds.file")) {
    rlang::abort("Input not supported for this function: read function documentation")
  }

  if (data.type %in% c("SeqVarGDSClass", "gds.file")) {
    radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)

    if (data.type == "gds.file") {
      data <- radiator::read_rad(data, verbose = verbose)
      data.type <- "SeqVarGDSClass"
    }
  } else {
    if (is.vector(data)) data <- radiator::tidy_wide(data = data, import.metadata = TRUE)
    data.type <- "tbl_df"
  }


  # Check biallelic ----------------------------------------------------------
  if (is.null(biallelic)) biallelic <- detect_biallelic_markers(data = data)
  if (!biallelic) {
    if (ncol(whitelist.markers) >= 3) {
      if (verbose) message("Note: whitelist with CHROM LOCUS POS columns and VCF haplotype:
                If the whitelist was not created from this VCF,
                the filtering could result in losing all the markers.
                The POS column is different in biallelic and multiallelic file...\n")

      if (verbose) message("Discarding the POS column in the whitelist if present")
      if (tibble::has_name(whitelist.markers, "POS")) {
        whitelist.markers  %<>%  dplyr::select(-POS)
      }
    }
  }

  # Filter parameter file: generate and initiate -----------------------------
  filters.parameters <- radiator_parameters(
    generate = TRUE,
    initiate = TRUE,
    update = FALSE,
    parameter.obj = parameters,
    data = data,
    path.folder = path.folder,
    file.date = file.date,
    internal = internal,
    verbose = verbose)

  # filtering ----------------------------------------------------------------
  # tidy data ----------------------------------------------------------------
  if (data.type == "tbl_df") {
    data <- suppressWarnings(dplyr::semi_join(
      data,
      whitelist.markers,
      by = intersect(colnames(data), colnames(whitelist.markers))))

    if (nrow(data) == 0) {
      rlang::abort("No markers left in the dataset, check whitelist...")
    }
  }

  # GDS
  if (data.type == "SeqVarGDSClass") {
    # extract all the markers
    markers.meta <- extract_markers_metadata(
      gds = data,
      whitelist = FALSE
    )

    if (rlang::has_name(whitelist.markers, "VARIANT_ID") && rlang::has_name(markers.meta, "VARIANT_ID")) {
      use.variant <- TRUE
      use.markers <- use.others <- FALSE
    } else if (rlang::has_name(whitelist.markers, "MARKERS") && rlang::has_name(markers.meta, "MARKERS")) {
      use.variant <- use.others <- FALSE
      use.markers <- TRUE
      FALSE
    } else {
      use.variant <- use.markers <- FALSE
      use.others <- TRUE
    }


    if (use.variant) {
      markers.meta %<>%
        dplyr::mutate(
          FILTERS = dplyr::if_else(
            !VARIANT_ID %in% whitelist.markers$VARIANT_ID &
              FILTERS == "whitelist",
            "filter.whitelist",
            FILTERS
          )
        )
    }


    if (use.markers) {
      markers.meta %<>%
        dplyr::mutate(
          FILTERS = dplyr::if_else(
            !MARKERS %in% whitelist.markers$MARKERS &
              FILTERS == "whitelist",
            "filter.whitelist",
            FILTERS
          )
        )
    }

    if (use.others) {
      common.meta <- rlang::syms((intersect(colnames(markers.meta), colnames(whitelist.markers))))
      whitelist.markers %<>%
        dplyr::mutate(
          .data = .,
          COMMON_META = stringi::stri_join(!!!common.meta, sep = "__")
        )

      test <- markers.meta %>%
        dplyr::mutate(
          .data = .,
          COMMON_META = stringi::stri_join(!!!common.meta, sep = "__")
        ) %>%
        dplyr::filter(COMMON_META %in% whitelist.markers$COMMON_META)

      #   FILTERS = dplyr::if_else(
      #     !COMMON_META %in% whitelist.markers$COMMON_META &
      #       FILTERS == "whitelist",
      #     "filter.whitelist",
      #     FILTERS
      #   )
      #   # COMMON_META = NULL
      # )
      common.meta <- NULL
    }


    if (nrow(dplyr::filter(markers.meta, FILTERS == "whitelist")) == 0) {
      rlang::abort("No markers left in the dataset, check whitelist...")
    }

    update_radiator_gds(
      gds = data,
      node.name = "markers.meta",
      value = markers.meta,
      replace = TRUE,
      sync = TRUE,
      verbose = TRUE
    )

    write_rad(
      data = markers.meta %>% dplyr::filter(FILTERS == "filter.whitelist"),
      path = path.folder,
      filename = stringi::stri_join("blacklist.markers_", file.date, ".tsv"),
      tsv = TRUE, internal = internal, verbose = verbose)

    write_rad(
      data = markers.meta %>% dplyr::filter(FILTERS == "whitelist"),
      path = path.folder,
      filename = stringi::stri_join("whitelist.markers_", file.date, ".tsv"),
      tsv = TRUE, internal = internal, verbose = verbose)
  } # End GDS

  # Filter parameter file: update --------------------------------------------
  filters.parameters <- radiator_parameters(
    generate = FALSE,
    initiate = FALSE,
    update = TRUE,
    parameter.obj = filters.parameters,
    data = data,
    filter.name = "whitelist markers",
    param.name = "whitelist.markers",
    values = n.markers.w,
    path.folder = path.folder,
    file.date = file.date,
    internal = internal,
    verbose = verbose)
  # results ------------------------------------------------------------------
  radiator_results_message(
    rad.message = stringi::stri_join("\nFilter whitelist : ", n.markers.w, "SNPs"),
    filters.parameters,
    internal,
    verbose
  )
  return(data)
}#End filter_whitelist



#' @title generate_whitelist
#' @description Generate Whitelist of markers automatically for different threshold
#' @rdname generate_whitelist
#' @keywords internal
#' @export
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}

generate_whitelist <- function(x, t, path.folder = NULL) {
  if (is.null(path.folder)) path.folder <- getwd()

  x <- dplyr::ungroup(x) %>%
    dplyr::filter(N_POP_HWD == hw.pop.threshold)

  bl_map <- function(x, unfiltered.data, hw.pop.threshold,
                     path.folder, filters.parameters.path) {

    if (nrow(x) > 0) {
      significance.group <- unique(x$SIGNIFICANCE)

      significance.group <- dplyr::case_when(
        significance.group == "*****" ~ 0.00001,
        significance.group == "****" ~ 0.0001,
        significance.group == "***" ~ 0.001,
        significance.group == "**" ~ 0.01,
        significance.group == "*" ~ 0.05) %>%
        format(., scientific = FALSE)

      blacklist.filename <- file.path(
        path.folder,
        stringi::stri_join(
          "blacklist.markers.hwd", significance.group, "mid.p.value",
          hw.pop.threshold, "hw.pop.threshold", "tsv", sep = "."))

      whitelist.filename <- file.path(
        path.folder,
        stringi::stri_join(
          "whitelist.markers.hwe", significance.group, "mid.p.value",
          hw.pop.threshold, "hw.pop.threshold", "tsv", sep = "."))

      rad.filename <- file.path(
        path.folder,
        stringi::stri_join(
          "tidy.filtered.hwe", significance.group, "mid.p.value",
          hw.pop.threshold, "hw.pop.threshold", "rad", sep = "."))

      # Generate the blacklist
      want <- c("MARKERS", "CHROM", "LOCUS", "POS")
      blacklist <- dplyr::distinct(x, MARKERS) %>%
        dplyr::left_join(
          dplyr::select(unfiltered.data, tidyselect::any_of(want)) %>%
            dplyr::distinct(MARKERS, .keep_all = TRUE)
          , by = "MARKERS") %>%
        readr::write_tsv(
          x = .,
          file = blacklist.filename)


      # Generate the rad data + the whitelist
      if (!is.null(data.temp)) {
        whitelist <- unfiltered.data %>%
            dplyr::bind_rows(data.temp) %>%
            dplyr::mutate(POP_ID = factor(POP_ID, levels = pop.id.levels)) %>%
            dplyr::filter(!MARKERS %in% blacklist$MARKERS) %>%
            radiator::write_rad(data = ., path = rad.filename) %>%
            dplyr::select(tidyselect::any_of(want)) %>%
            dplyr::distinct(MARKERS, .keep_all = TRUE) %>%
            readr::write_tsv(x = ., file = whitelist.filename)
      } else {
        whitelist <- unfiltered.data %>%
          dplyr::filter(!MARKERS %in% blacklist$MARKERS) %>%
          radiator::write_rad(data = ., path = rad.filename) %>%
          dplyr::select(tidyselect::any_of(want)) %>%
          dplyr::distinct(MARKERS, .keep_all = TRUE) %>%
          readr::write_tsv(x = ., file = whitelist.filename)
      }

      fil.param <- update_filter_parameter(
        filter = "HWE",
        unfiltered = unfiltered.data,
        filtered = whitelist,
        parameter = "hw.pop.threshold/mid.p.value",
        threshold = stringi::stri_join(hw.pop.threshold, significance.group, sep = "/"),
        param.path = filters.parameters.path)
    }
  }#End bl_map
  split(x = x, f = x$SIGNIFICANCE) %>%
    purrr::walk(
      .x = ., .f = bl_map,
      unfiltered.data = unfiltered.data,
      hw.pop.threshold = hw.pop.threshold,
      path.folder = path.folder,
      filters.parameters.path = filters.parameters.path)
  fil.param <- readr::read_tsv(file = filters.parameters.path, col_types = "cccccccc") %>%
    dplyr::filter(FILTERS == "HWE")
  return(fil.param)
}#End generate_whitelist
thierrygosselin/radiator documentation built on May 5, 2024, 5:12 a.m.