R/filter_snp_number.R

Defines functions filter_snp_number

Documented in filter_snp_number

# SNP number per haplotype
#' @name filter_snp_number
#' @title Filter SNP number per locus/read
#' @description This filter removes outlier markers
#' with too many SNP number per locus/read.
#' The data requires snp and locus information (e.g. from a VCF file).
#' Having a higher than "normal" SNP number is usually the results of
#' assembly artifacts or bad assembly parameters.
#' This filter is population-agnostic, but still requires a strata
#' file if a vcf file is used as input.
#'
#' \strong{Filter targets}: Markers
#'
#' \strong{Statistics}: The number of SNPs per locus.


# Most arguments are inherited from tidy_genomic_data
#' @inheritParams read_strata
#' @inheritParams radiator_common_arguments

#' @param filter.snp.number (integer) This is best decided after viewing the figures.
#' If the argument is set to 2, locus with 3 and more SNPs will be blacklisted.
#' Default: \code{filter.snp.number = NULL}.

#' @param filename (optional) Name of the filtered tidy data frame file
#' written to the working directory (ending with \code{.tsv})
#' Default: \code{filename = NULL}.


#' @rdname filter_snp_number
#' @export
#' @details
#' \strong{Interactive version}
#'
#' There are 2 steps in the interactive version to visualize and filter
#' the data based on the number of SNP on the read/locus:
#'
#' Step 1. SNP number per read/locus visualization
#'
#' Step 2. Choose the filtering thresholds
#'
#'
#' @return A list in the global environment with 6 objects:
#' \enumerate{
#' \item $snp.number.markers
#' \item $number.snp.reads.plot
#' \item $whitelist.markers
#' \item $tidy.filtered.snp.number
#' \item $blacklist.markers
#' \item $filters.parameters
#' }
#'
#' The object can be isolated in separate object outside the list by
#' following the example below.

#' @examples
#' \dontrun{
#' turtle.outlier.snp.number <- radiator::filter_snp_number(
#' data = "turtle.vcf",
#' strata = "turtle.strata.tsv",
#' max.snp.number = 4,
#' filename = "tidy.data.turtle.tsv"
#' )

#'
#' tidy.data <- turtle.outlier.snp.number$tidy.filtered.snp.number
#'
#' #Inside the same list, to isolate the markers blacklisted:
#' blacklist <- turtle.outlier.snp.number$blacklist.markers
#'
#' }

filter_snp_number <- function(
  data,
  strata = NULL,
  interactive.filter = TRUE,
  filter.snp.number = NULL,
  filename = NULL,
  parallel.core = parallel::detectCores() - 1,
  verbose = TRUE,
  ...
) {

  # interactive.filter <- TRUE
  # data <- gds
  # path.folder <- "testing_snp_number"
  # force.stats <- TRUE
  # parameters <- NULL
  # filename <- NULL
  # filter.snp.number <- NULL
  # parallel.core <- parallel::detectCores() - 1
  # verbose = TRUE
  # obj.keeper <- c(ls(envir = globalenv()), "data")

  if (!is.null(filter.snp.number) || interactive.filter) {
    if (interactive.filter) verbose <- TRUE
    radiator_function_header(f.name = "filter_snp_number", verbose = verbose)

    # Cleanup-------------------------------------------------------------------
    file.date <- format(Sys.time(), "%Y%m%d@%H%M")
    if (verbose) message("Execution date@time: ", file.date)
    old.dir <- getwd()
    opt.change <- getOption("width")
    options(width = 70)
    timing <- radiator_tic()
    # res <- list()
    #back to the original directory and options
    on.exit(setwd(old.dir), add = TRUE)
    on.exit(options(width = opt.change), add = TRUE)
    on.exit(radiator_toc(timing), add = TRUE)
    on.exit(radiator_function_header(f.name = "filter_snp_number", start = FALSE, verbose = verbose), add = TRUE)
    # on.exit(rm(list = setdiff(ls(envir = sys.frame(-1L)), obj.keeper), envir = sys.frame(-1L)))

    # Function call and dotslist -------------------------------------------------
    rad.dots <- radiator_dots(
      func.name = as.list(sys.call())[[1]],
      fd = rlang::fn_fmls_names(),
      args.list = as.list(environment()),
      dotslist = rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE),
      keepers = c("path.folder", "parameters", "internal", "force.stats"),
      verbose = FALSE
    )

    # Checking for missing and/or default arguments ------------------------------
    if (missing(data)) rlang::abort("data is missing")

    # Folders---------------------------------------------------------------------
    path.folder <- generate_folder(
      f = path.folder,
      rad.folder = "filter_snp_number",
      internal = internal,
      file.date = file.date,
      verbose = verbose)

    # write the dots file
    write_rad(
      data = rad.dots,
      path = path.folder,
      filename = stringi::stri_join("radiator_filter_snp_number_args_", file.date, ".tsv"),
      tsv = TRUE,
      internal = internal,
      write.message = "Function call and arguments stored in: ",
      verbose = verbose
    )

    # Message about steps taken during the process ---------------------------------
    if (interactive.filter) {
      message("Interactive mode: on")
      message("2 steps to visualize and filter the data based on the number of SNP on the read/locus:")
      message("Step 1. Impact of SNP number per read/locus (on individual genotypes and locus/snp number potentially filtered)")
      message("Step 2. Choose the filtering thresholds")
    }

    # File type detection----------------------------------------------------------
    data.type <- radiator::detect_genomic_format(data)
    if (!data.type %in% c("tbl_df", "fst.file", "SeqVarGDSClass", "gds.file")) {
      rlang::abort("Input not supported for this function: read function documentation")
    }


    if (data.type %in% c("SeqVarGDSClass", "gds.file")) {
      radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)

      if (data.type == "gds.file") {
        data <- radiator::read_rad(data, verbose = verbose)
        data.type <- "SeqVarGDSClass"
      }
    } else {
      if (is.vector(data)) data <- radiator::tidy_wide(data = data, import.metadata = TRUE)
      data.type <- "tbl_df"
    }

    # Filter parameter file: initiate ------------------------------------------
    filters.parameters <- radiator_parameters(
      generate = TRUE,
      initiate = TRUE,
      update = FALSE,
      parameter.obj = parameters,
      data = data,
      path.folder = path.folder,
      file.date = file.date,
      internal = internal,
      verbose = verbose)

    # Whitelist and blacklist --------------------------------------------------
    want <- c("MARKERS", "CHROM", "LOCUS", "POS", "COL")
    if (data.type == "SeqVarGDSClass") {
      wl <- extract_markers_metadata(gds = data, whitelist = TRUE) # not optimal, currently used just to get locus info
    } else {
      wl <- bl <- dplyr::select(data, tidyselect::any_of(want))
    }
    # Check that required info is present in data: snp and locus
    if (!tibble::has_name(wl, "LOCUS") || !tibble::has_name(wl, "POS")) {
      problem.data <- "This filter requires dataset with SNP (POS) and LOCUS information (columns)"
      message("\n\n", problem.data)
      readr::write_lines(
        x = problem.data,
        file = file.path(path.folder, "README"))
      return(data)
    }

    # Generate snp per locus stats----------------------------------------------
    if (verbose) message("Generating statistics")
    if (data.type == "SeqVarGDSClass") {
      wl <- generate_stats(
        gds = data,
        individuals = FALSE,
        snp.per.locus = TRUE,
        missing = FALSE,
        heterozygosity = FALSE,
        coverage = FALSE,
        allele.coverage = FALSE,
        mac = FALSE,
        snp.position.read = FALSE,
        force.stats = force.stats,
        path.folder = path.folder,
        file.date = file.date,
        plot = FALSE,
        parallel.core = parallel.core
      )
      stats <- wl$m.stats
      wl <- bl <- wl$m.info
    } else {
      bl <- wl
      wl %<>%
        dplyr::group_by(LOCUS) %>%
        dplyr::mutate(SNP_PER_LOCUS = n()) %>%
        dplyr::ungroup(.)
      stats <- tibble_stats(
        x = wl$SNP_PER_LOCUS,
        group = "SNPs per locus")
    }

    if (tibble::has_name(wl, "COL")) {
      read.length <- max(wl$COL)
      if (verbose) message("\nWith max read length taken from data: ", read.length)

      if (verbose) message("    The max number of SNP per locus correspond to:")
      if (verbose) message("    1 SNP per ", round(read.length / stats[[6]]), " bp\n")
    } else {
      read.length <- NULL
    }


    # Generate box plot ---------------------------------------------------------
    snp.per.locus.fig <- boxplot_stats(
      data = stats,
      title =  "Number of SNPs per locus",
      subtitle = if (!is.null(read.length)) {
        stringi::stri_join("Read length (max): ", read.length, " bp", "\nOutlier: ", ceiling(stats[[9]]))
      } else {
        stringi::stri_join("\nOutlier: ", ceiling(stats[[9]]))
      },
      x.axis.title = NULL,
      y.axis.title = "Number of SNPs per locus",
      bp.filename = stringi::stri_join("snp_per_locus_", file.date, ".pdf"),
      path.folder = path.folder)


    # Distribution -------------------------------------------------------------
    d.plot <- wl %>%
      dplyr::distinct(LOCUS, SNP_PER_LOCUS) %>%
      ggplot2::ggplot(data = ., ggplot2::aes(factor(SNP_PER_LOCUS))) +
      ggplot2::geom_bar() +
      ggplot2::labs(x = "Number of SNPs per locus") +
      ggplot2::labs(y = "Distribution (number of locus)") +
      ggplot2::geom_vline(ggplot2::aes(xintercept = as.numeric(ceiling(stats[[9]]))), color = "yellow") +
      ggplot2::theme_bw()+
      ggplot2::theme(
        axis.title.x = ggplot2::element_text(size = 12, face = "bold"),
        axis.title.y = ggplot2::element_text(size = 12, face = "bold"),
        legend.title = ggplot2::element_text(size = 12, face = "bold"),
        legend.text = ggplot2::element_text(size = 12, face = "bold"),
        strip.text.x = ggplot2::element_text(size = 12, face = "bold"),
        axis.text.x = ggplot2::element_text(size = 12, angle = 90, hjust = 1, vjust = 0.5)
      )
    print(d.plot)

    # save
    d.plot.filename <- stringi::stri_join("snp_per_locus_distribution_", file.date, ".pdf")

    ggplot2::ggsave(
      filename = file.path(path.folder, d.plot.filename),
      plot = d.plot,
      width = 20, height = 10, dpi = 300, units = "cm", useDingbats = FALSE,
      limitsize = FALSE)


    # Helper table -------------------------------------------------------------
    if (verbose) message("Generating helper table...")
    how_many_markers <- function(threshold, x) {
      nrow(dplyr::filter(x, SNP_PER_LOCUS > threshold))
    }#End how_many_markers

    snp.range <- stats[[2]]:stats[[6]]
    n.markers <- nrow(wl)

    helper.table <- tibble::tibble(SNP_PER_LOCUS = snp.range) %>%
      dplyr::mutate(
        BLACKLISTED_MARKERS = purrr::map_int(
          .x = snp.range, .f = how_many_markers, x = wl),
        WHITELISTED_MARKERS = n.markers - BLACKLISTED_MARKERS
      ) %>%
      readr::write_tsv(
        x = .,
        file = file.path(path.folder, "snp.per.locus.helper.table.tsv"))

    # figures
    markers.plot <- ggplot2::ggplot(
      data = helper.table  %<>%
        tidyr::pivot_longer(
          data = .,
          cols = -SNP_PER_LOCUS,
          names_to = "LIST",
          values_to = "MARKERS"
        ),
      ggplot2::aes(x = SNP_PER_LOCUS, y = MARKERS)) +
      ggplot2::geom_line() +
      ggplot2::geom_point(size = 2, shape = 21, fill = "white") +
      ggplot2::geom_vline(ggplot2::aes(xintercept = as.numeric(ceiling(stats[[9]]))), color = "yellow") +
      ggplot2::scale_x_continuous(name = "Number of SNPs per locus allowed", breaks = snp.range) +
      ggplot2::scale_y_continuous(name = "Number of markers") +
      ggplot2::theme_bw()+
      ggplot2::theme(
        axis.title.x = ggplot2::element_text(size = 10, face = "bold"),
        axis.title.y = ggplot2::element_text(size = 10, face = "bold"),
        axis.text.x = ggplot2::element_text(size = 8, angle = 90, hjust = 1, vjust = 0.5)
      ) +
      ggplot2::facet_grid(LIST ~. , scales = "free", space = "free")
    print(markers.plot)

    # save
    ggplot2::ggsave(
      filename = file.path(path.folder, "snp.per.locus.helper.plot.pdf"),
      plot = markers.plot,
      width = 20 + (stats[[6]] / 10),
      height = 15,
      dpi = 300,
      units = "cm",
      useDingbats = FALSE,
      limitsize = FALSE)
    helper.table <- markers.plot <- NULL
    if (verbose) message("Files written: helper tables and plots")

    # Step 2. Thresholds selection ---------------------------------------------
    if (interactive.filter) {
      max.allowed <- stats[[6]]
      if (verbose) message("\nStep 2. Filtering markers based on the maximum of SNPs per locus\n")

      filter.snp.number <- radiator_question(
        x = "Do you still want to blacklist markers? (y/n):",
        answer.opt = c("y", "n"))

      if (filter.snp.number == "y") {
        outlier.stats <- radiator_question(
          x = "2 options to blacklist SNPs:\n1. based on the outlier statistics\n2. enter your own threshold", minmax = c(1, 2))
        if (outlier.stats == 1) {
          filter.snp.number <- "outliers"
        } else {
          filter.snp.number <- radiator_question(
            x = "Enter the maximum number of SNP per locus allowed:", minmax = c(1, max.allowed))
        }
        outlier.stats <- NULL
      } else {
        filter.snp.number <- NULL
      }
    }

    # Filtering ----------------------------------------------------------------
    if (!is.null(filter.snp.number)) {
      if (!purrr::is_double(filter.snp.number)) {
        out.high <- round(stats$OUTLIERS_HIGH[stats$GROUP == "SNPs per locus"])
        if (verbose) message("\nRemoving outliers markers based on the number of SNPs per locus statistic: ", out.high)
        filter.snp.number <- out.high
      } else {
        if (verbose) message("\nRemoving markers based on the number of SNPs per locus statistic: ", filter.snp.number)
      }
    } else {
      filter.snp.number <- 1000000000000
    }

    # Whitelist and Blacklist of markers
    if (!is.null(filter.snp.number)) {
      wl %<>% dplyr::filter(SNP_PER_LOCUS <= filter.snp.number)
    }
    readr::write_tsv(
      x = wl,
      file = file.path(path.folder, "whitelist.snp.per.locus.tsv"),
      append = FALSE, col_names = TRUE)
    bl %<>% dplyr::setdiff(wl) %>% dplyr::mutate(FILTERS = "filter.snp.number")

    readr::write_tsv(
      x = bl,
      file = file.path(path.folder, "blacklist.snp.per.locus.tsv"),
      append = FALSE, col_names = TRUE)
    # saving whitelist and blacklist
    if (verbose) message("File written: whitelist.markers.genotyping.tsv")
    if (verbose) message("File written: blacklist.markers.genotyping.tsv")

    if (data.type == "SeqVarGDSClass") {

      markers.meta <- extract_markers_metadata(gds = data) %>%
        dplyr::mutate(
          FILTERS = dplyr::if_else(
            VARIANT_ID %in% bl$VARIANT_ID, "filter.snp.number", FILTERS
          )
        )

      # Update GDS
      update_radiator_gds(
        gds = data,
        node.name = "markers.meta",
        value = markers.meta,
        sync = TRUE
      )



    } else {
      # Apply the filter to the tidy data
      data  %<>% dplyr::filter(MARKERS %in% wl$MARKERS)
    }

    # Update parameters --------------------------------------------------------
    filters.parameters <- radiator_parameters(
      generate = FALSE,
      initiate = FALSE,
      update = TRUE,
      parameter.obj = filters.parameters,
      data = data,
      filter.name = "Filter markers snp number",
      param.name = "filter.snp.number",
      values = filter.snp.number,
      path.folder = path.folder,
      file.date = file.date,
      internal = internal,
      verbose = verbose)

    # results ------------------------------------------------------------------
    radiator_results_message(
      rad.message = stringi::stri_join("\nFilter SNPs per locus threshold: ",
                                       filter.snp.number),
      filters.parameters,
      internal,
      verbose
    )
  }
  return(data)
} #End filter_snp_number
thierrygosselin/radiator documentation built on May 5, 2024, 5:12 a.m.