R/filter_snp_position_read.R

Defines functions filter_snp_position_read

Documented in filter_snp_position_read

# SNP number per haplotype
#' @name filter_snp_position_read
#' @title Filter markers/SNP based on their position on the read
#' @description This filter removes markers/SNPs based on their position on the read.
#' The data requires snp, locus and col information (e.g. from a VCF file).
#'
#' The impact of assembly artifacts can be tested in downstream analysis with
#' the whitelist and blacklist generated by this function.
#'
#'
#' \strong{Filter targets}: Markers
#'
#' \strong{Statistics}: The position of the SNPs on the read.
#'


#' @inheritParams read_strata
#' @inheritParams radiator_common_arguments

#' @param filter.snp.position.read (character)
#' Options are: \code{"outliers", "q75", "iqr", c(min value,max value)}.
#' For a safe and conservative
#' value, use \code{"outliers"}, this will remove SNPs with outlier position on
#' the reads.
#' Default: \code{filter.snp.read.position = NULL}.

#' @param filename (optional) Name of the filtered tidy data frame file
#' written to the working directory (ending with \code{.tsv})
#' Default: \code{filename = NULL}.


#' @rdname filter_snp_position_read
#' @export
#' @details
#' \strong{Interactive version}
#'
#' There are 2 steps in the interactive version to visualize and filter
#' the data based on the number of SNP on the read/locus:
#'
#' Step 1. SNP number per read/locus visualization
#'
#' Step 2. Choose the filtering thresholds
#'
#'
#' @return A list in the global environment with 6 objects:
#' \enumerate{
#' \item $snp.number.markers
#' \item $number.snp.reads.plot
#' \item $whitelist.markers
#' \item $tidy.filtered.snp.number
#' \item $blacklist.markers
#' \item $filters.parameters
#' }
#'
#' The object can be isolated in separate object outside the list by
#' following the example below.

#' @examples
#' \dontrun{
#' turtle <- radiator::filter_snp_position_read(
#' data = "turtle.vcf",
#' strata = "turtle.strata.tsv",
#' filter.snp.position.read = "outliers",
#' filename = "tidy.data.turtle.tsv"
#' )
#' }

filter_snp_position_read <- function(
  data,
  strata = NULL,
  interactive.filter = TRUE,
  filter.snp.position.read = NULL,
  filename = NULL,
  parallel.core = parallel::detectCores() - 1,
  verbose = TRUE,
  ...
) {
  # TEST
  # data <- gds
  # interactive.filter <- TRUE
  # path.folder <- "testing_snp_position"
  # force.stats <- TRUE
  # parameters <- NULL
  # filename <- NULL
  # filter.snp.position.read <- "outliers"
  # parallel.core <- parallel::detectCores() - 1
  # verbose = TRUE
  # obj.keeper <- c(ls(envir = globalenv()), "data")

  if (!is.null(filter.snp.position.read) || interactive.filter) {
    if (interactive.filter) verbose <- TRUE
    radiator_function_header(f.name = "filter_snp_position_read", verbose = verbose)

    # Cleanup-------------------------------------------------------------------
    file.date <- format(Sys.time(), "%Y%m%d@%H%M")
    timing <- radiator_tic()
    if (verbose) message("Execution date@time: ", file.date)
    old.dir <- getwd()
    opt.change <- getOption("width")
    options(width = 70)
    on.exit(setwd(old.dir), add = TRUE)
    on.exit(options(width = opt.change), add = TRUE)
    on.exit(radiator_toc(timing), add = TRUE)
    on.exit(radiator_function_header(f.name = "filter_snp_position_read", start = FALSE, verbose = verbose), add = TRUE)
    # on.exit(rm(list = setdiff(ls(envir = sys.frame(-1L)), obj.keeper), envir = sys.frame(-1L)))

    # Function call and dotslist -------------------------------------------------
    rad.dots <- radiator_dots(
      func.name = as.list(sys.call())[[1]],
      fd = rlang::fn_fmls_names(),
      args.list = as.list(environment()),
      dotslist = rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE),
      keepers = c("path.folder", "parameters", "internal"),
      verbose = FALSE
    )

    # Checking for missing and/or default arguments ------------------------------
    if (missing(data)) rlang::abort("data is missing")

    # Folders---------------------------------------------------------------------
    path.folder <- generate_folder(
      f = path.folder,
      rad.folder = "filter_snp_position_read",
      internal = internal,
      file.date = file.date,
      verbose = verbose)

    # write the dots file
    write_rad(
      data = rad.dots,
      path = path.folder,
      filename = stringi::stri_join("radiator_filter_snp_position_read_args_", file.date, ".tsv"),
      tsv = TRUE,
      internal = internal,
      write.message = "Function call and arguments stored in: ",
      verbose = verbose
    )

    # Message about steps taken during the process ---------------------------------
    if (interactive.filter) {
      message("2 steps to visualize and filter the data based on the number of SNP on the read/locus:")
      message("Step 1. Visualization (boxplot, distribution")
      message("Step 2. Threshold selection")
    }

    # File type detection----------------------------------------------------------
    data.type <- radiator::detect_genomic_format(data)
    if (!data.type %in% c("tbl_df", "fst.file", "SeqVarGDSClass", "gds.file")) {
      rlang::abort("Input not supported for this function: read function documentation")
    }


    if (data.type %in% c("SeqVarGDSClass", "gds.file")) {
      radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)

      if (data.type == "gds.file") {
        data <- radiator::read_rad(data, verbose = verbose)
        data.type <- "SeqVarGDSClass"
      }
    } else {
      if (is.vector(data)) data <- radiator::tidy_wide(data = data, import.metadata = TRUE)
      data.type <- "tbl_df"
    }

    # Filter parameter file: initiate ------------------------------------------
    filters.parameters <- radiator_parameters(
      generate = TRUE,
      initiate = TRUE,
      update = FALSE,
      parameter.obj = parameters,
      data = data,
      path.folder = path.folder,
      file.date = file.date,
      internal = internal,
      verbose = verbose)

    # Whitelist and blacklist --------------------------------------------------
    want <- c("MARKERS", "CHROM", "LOCUS", "POS", "COL")
    if (data.type == "SeqVarGDSClass") {
      wl <- bl <- extract_markers_metadata(gds = data, whitelist = TRUE)

      # check for presence of COL info...
      if (!rlang::has_name(wl, "COL")) {
        message("COL info required, returning data")
        return(data)
      }
      if (!is.numeric(wl$COL)) wl$COL <- as.numeric(wl$COL)
    } else {
      wl <- bl <- dplyr::select(data, tidyselect::any_of(want))
    }

    # Check that required info is present in data: snp and locus----------------
    locus.check <- tibble::has_name(wl, "LOCUS")
    snp.check <- tibble::has_name(wl, "POS")
    col.check <- tibble::has_name(wl, "COL")
    if (!locus.check || !snp.check || !col.check) {
      problem.data <- "This filter requires a dataset with SNP (POS), LOCUS and COL informations"
      message("\n\n", problem.data)
      readr::write_lines(
        x = problem.data,
        file = file.path(path.folder, "README"))
      return(data)
    }


    # Generate snp per locus stats----------------------------------------------
    if (verbose) message("Generating SNP position on read stats")
    stats <- tibble_stats(
      x = wl %>%
        dplyr::distinct(MARKERS,COL) %$% COL,# %>% as.numeric(.)
      group = "snp position on read")

    snp.col.iqr.threshold <- c(stats$Q25, stats$Q75)

    # Generate box plot --------------------------------------------------------
    read.length <- max(wl$COL)

    fig <- boxplot_stats(
      data = stats,
      title =  "SNP position on the read",
      subtitle = if (!is.null(read.length)) {
        stringi::stri_join("Read length (max): ", read.length, " bp", "\nOutlier: ", ceiling(stats[[9]]))
      } else {
        stringi::stri_join("\nOutlier: ", ceiling(stats[[9]]))
      },
      x.axis.title = NULL,
      y.axis.title = "SNP position on the read (bp)",
      bp.filename = stringi::stri_join("snp.position.read.boxplot_", file.date, ".pdf"),
      path.folder = path.folder)

    # Distribution -------------------------------------------------------------
    d.plot <- wl %>%
      dplyr::distinct(MARKERS, COL)

    col.levels <- sort(unique(wl$COL))

    d.plot %<>% dplyr::mutate(COL = factor(x = COL, levels = col.levels))

    d.plot <- ggplot2::ggplot(
      data = d.plot, ggplot2::aes(factor(sort(as.integer(COL))))) +
      ggplot2::geom_bar() +
      ggplot2::labs(y = "Number of SNPs", x = "SNP position on the read (bp)") +
      ggplot2::theme_bw()+
      ggplot2::theme(
        axis.title.x = ggplot2::element_text(size = 12, face = "bold"),
        axis.title.y = ggplot2::element_text(size = 12, face = "bold"),
        legend.title = ggplot2::element_text(size = 12, face = "bold"),
        legend.text = ggplot2::element_text(size = 12, face = "bold"),
        strip.text.x = ggplot2::element_text(size = 12, face = "bold")
        # axis.text.x = ggplot2::element_text(size = 12, angle = 90, hjust = 1, vjust = 0.5)
      )
    # print(d.plot)

    # save
    d.plot.filename <- stringi::stri_join("snp.position.read.distribution_", file.date, ".pdf")

    ggplot2::ggsave(
      filename = file.path(path.folder, d.plot.filename),
      plot = d.plot,
      width = read.length, height = 10, dpi = 300, units = "cm",
      useDingbats = FALSE,
      limitsize = FALSE)

    # Helper table -------------------------------------------------------------
    if (verbose) message("Generating helper table...")
    n.markers <- nrow(wl)
    helper.table <- tibble::tibble(
      STATS = c("outliers", "q75", "iqr"),
      WHITELISTED_MARKERS = c(
        nrow(dplyr::filter(wl, COL <= stats[[9]])),
        nrow(dplyr::filter(wl, COL <= stats[[5]])),
        nrow(dplyr::filter(wl, COL >= stats[[3]] & COL <= stats[[5]])))
    ) %>%
      dplyr::mutate(
        BLACKLISTED_MARKERS = n.markers - WHITELISTED_MARKERS,
        STATS = factor(x = STATS, levels = c("outliers", "q75", "iqr"), ordered = FALSE)) %>%
      dplyr::arrange(STATS)

    readr::write_tsv(
      x = helper.table,
      file = file.path(path.folder, "snp.position.read.helper.table.tsv"))

    # figures
    markers.plot <- ggplot2::ggplot(
      data = tidyr::pivot_longer(
        data = helper.table,
        cols = -STATS,
        names_to = "LIST",
        values_to = "MARKERS"
      ),
      ggplot2::aes(x = STATS, y = MARKERS)) +
      # ggplot2::geom_line() +
      ggplot2::geom_point(size = 2, shape = 21, fill = "white") +
      ggplot2::scale_x_discrete(name = "SNP position on the read") +
      ggplot2::scale_y_continuous(name = "Number of markers") +
      ggplot2::theme_bw()+
      ggplot2::theme(
        axis.title.x = ggplot2::element_text(size = 10, face = "bold"),
        axis.title.y = ggplot2::element_text(size = 10, face = "bold"),
        axis.text.x = ggplot2::element_text(size = 8)#, angle = 90, hjust = 1, vjust = 0.5)
      ) +
      ggplot2::facet_grid(LIST ~. , scales = "free", space = "free")
    # print(markers.plot) # not that useful...

    print(d.plot) # more useful to leave at the end

    # save
    ggplot2::ggsave(
      filename = file.path(path.folder, "snp.position.read.helper.plot.pdf"),
      plot = markers.plot,
      width = 20,
      height = 15,
      dpi = 300,
      units = "cm",
      useDingbats = FALSE,
      limitsize = FALSE)
    helper.table <- markers.plot <- NULL
    if (verbose) message("Files written: helper tables and plots")

    # Step 2. Thresholds selection ---------------------------------------------
    if (interactive.filter) {
      message("\nStep 2. Filtering markers based on the SNPs position on the read\n")
      filter.snp.position.read <- radiator_question(
        x = "Choice of stats are: \n1: all (filter off)\n2: outliers\n3: q75\n4: iqr\n5: choose your own min and max values",
        answer.opt = c("1", "2", "3", "4", "5"))
      filter.snp.position.read <- stringi::stri_replace_all_fixed(
        str = filter.snp.position.read,
        pattern = c("1", "2", "3", "4", "5"),
        replacement = c("all", "outliers", "q75", "iqr", "thresholds"),
        vectorize_all = FALSE)
      if (filter.snp.position.read == "thresholds") {
        filter.snp.position.read[1] <- radiator_question(
          x = "Enter the min position of SNP on the read:",
          minmax = c(1,10000))
        filter.snp.position.read[2] <- radiator_question(
          x = "Enter the max position of SNP on the read:",
          minmax = c(1,10000))
      }
    }

    # filter.snp.position.read <- match.arg(
    #   filter.snp.position.read,
    #   choices = c("outliers", "q75", "iqr", "all"),
    #   several.ok = FALSE)

    # readr::write_tsv(x = stats, file = "testing.stats.tsv")

    # Filtering ----------------------------------------------------------------
    # if (filter.snp.position.read == "all") not necessary wl already exists...

    if (length(filter.snp.position.read) == 2) {
      wl %<>% dplyr::filter(COL >= as.numeric(filter.snp.position.read[1]) & COL <= as.numeric(filter.snp.position.read[2]))
    } else {
      if (filter.snp.position.read == "outliers") {
        wl %<>% dplyr::filter(COL <= stats[[9]])
      }
      if (filter.snp.position.read == "q75") {
        wl %<>% dplyr::filter(COL <= stats[[5]])
      }
      if (filter.snp.position.read == "iqr") {
        wl %<>% dplyr::filter(COL >= stats[[3]] & COL <= stats[[5]])
      }
    }

    # Whitelist and Blacklist of markers
    readr::write_tsv(
      x = wl,
      file = file.path(path.folder, "whitelist.markers.snp.position.read.tsv"),
      append = FALSE, col_names = TRUE)

    bl %<>% dplyr::filter(!MARKERS %in% wl$MARKERS) %>%
      dplyr::mutate(FILTERS = "filter.snp.position.read")
    # dplyr::setdiff(wl) %>% # crash RStudio...

    if (nrow(bl) > 0) {
      readr::write_tsv(
        x = bl,
        file = file.path(path.folder, "blacklist.markers.snp.position.read.tsv"),
        append = FALSE, col_names = TRUE)
    }

    # saving whitelist and blacklist
    if (verbose) message("File written: whitelist.markers.snp.position.read.tsv")
    if (verbose) message("File written: blacklist.markers.snp.position.read.tsv")

    if (data.type == "SeqVarGDSClass") {
      markers.meta <- extract_markers_metadata(gds = data, whitelist = FALSE) %>%
        dplyr::mutate(
          FILTERS = dplyr::if_else(
            VARIANT_ID %in% bl$VARIANT_ID, "filter.snp.position.read", FILTERS
          )
        )

      update_radiator_gds(
        gds = data,
        node.name = "markers.meta",
        value = markers.meta,
        sync = TRUE
      )

      # update blacklist.markers
      # if (nrow(bl) > 0) {
      #   bl %<>% dplyr::select(MARKERS) %>%
      #     dplyr::mutate(FILTER = "filter.snp.position.read")
      #   bl.gds <- update_bl_markers(gds = data, update = bl)
      # }

    } else {
      # Apply the filter to the tidy data
      data  %<>% dplyr::filter(MARKERS %in% wl$MARKERS)
    }

    # radiator_parameters------------------------------------------------------
    filters.parameters <- radiator_parameters(
      generate = FALSE,
      initiate = FALSE,
      update = TRUE,
      parameter.obj = filters.parameters,
      data = data,
      filter.name = "Filter SNPs position on the read",
      param.name = "filter.snp.position.read",
      values = filter.snp.position.read,
      path.folder = path.folder,
      file.date = file.date,
      internal = internal,
      verbose = verbose)

    # results ------------------------------------------------------------------
    radiator_results_message(
      rad.message = stringi::stri_join("\nFilter SNP position on the read : ",
                                       filter.snp.position.read),
      filters.parameters,
      internal,
      verbose
    )
  }
  return(data)
} #End filter_snp_position_read
thierrygosselin/radiator documentation built on May 5, 2024, 5:12 a.m.