R/filter_common_markers.R

Defines functions not_common_markers filter_common_markers

Documented in filter_common_markers not_common_markers

# Keep markers in common between all populations

#' @name filter_common_markers

#' @title Filter common markers between strata

#' @description The function will filter the markers by keeping only those
#' in common between all strata
#' (population or any groupings defined in \code{STRATA} column).
#'
#' \strong{Filter targets}: SNPs
#'
#' \strong{Statistics}: strata genotyping rate per SNPs
#'
#'
#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and might be of interest for users who wants to keep only markers in common.


#' @param data (4 options) A file or object generated by radiator:
#' \itemize{
#' \item tidy data
#' \item Genomic Data Structure (GDS)
#' }
#'
#' \emph{How to get GDS and tidy data ?}
#' Look into \code{\link{tidy_genomic_data}},
#' \code{\link{read_vcf}} or
#' \code{\link{tidy_vcf}}.
#'
#' @param filter.common.markers (optional, logical)
#' Default: \code{filter.common.markers = TRUE}.
#'
#' @param fig (optional, logical) \code{fig = TRUE} will produce an
#' \href{https://github.com/hms-dbmi/UpSetR}{UpSet fig} to visualize the number
#' of markers between populations. The package is required for this to work...
#' Default: \code{fig = FALSE}.

#' @param verbose (optional, logical) \code{verbose = TRUE} to be chatty
#' during execution.
#' Default: \code{verbose = FALSE}.
#' @param ... (optional) To pass further arguments for fine-tuning the function
#' and legacy arguments.
#' @inheritParams tidy_genomic_data

#' @return A list with the filtered input, whitelist and blacklist of markers..

#' @examples
#' \dontrun{
#' require(SeqArray) # when using gds
#' common <- radiator::filter_common_markers(data = "my.radiator.gds.rad", verbose = TRUE)
#' }

#' @export
#' @rdname filter_common_markers

#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}

filter_common_markers <- function(
  data,
  filter.common.markers = TRUE,
  fig = FALSE,
  parallel.core = parallel::detectCores() - 1,
  verbose = FALSE,
  ...
) {
  # Test
  # filter.common.markers = TRUE
  # fig = TRUE
  # parallel.core = parallel::detectCores() - 1
  # verbose = TRUE
  # path.folder <- NULL
  # parameters <- NULL
  # internal <- FALSE

  if (!filter.common.markers) {
    return(data)
  } else {

    # Cleanup-------------------------------------------------------------------
    radiator_function_header(f.name = "filter_common_markers", verbose = verbose)
    file.date <- format(Sys.time(), "%Y%m%d@%H%M")
    if (verbose) message("Execution date@time: ", file.date)
    old.dir <- getwd()
    opt.change <- getOption("width")
    options(width = 70)
    timing <- radiator_tic()
    #back to the original directory and options
    on.exit(setwd(old.dir), add = TRUE)
    on.exit(options(width = opt.change), add = TRUE)
    on.exit(radiator_toc(timing, verbose = verbose), add = TRUE)
    on.exit(radiator_function_header(f.name = "filter_common_markers", start = FALSE, verbose = verbose), add = TRUE)

    # Checking for missing and/or default arguments ------------------------------
    if (missing(data)) rlang::abort("Input file missing")

    # Function call and dotslist -------------------------------------------------
    rad.dots <- radiator_dots(
      func.name = as.list(sys.call())[[1]],
      fd = rlang::fn_fmls_names(),
      args.list = as.list(environment()),
      dotslist = rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE),
      keepers = c("path.folder", "parameters", "internal"),
      verbose = FALSE
    )
    if (internal) fig <- FALSE
    if (fig) {
      if (!requireNamespace("UpSetR", quietly = TRUE)) {
        rlang::abort("UpSetR needed for this function to work
                   Install with install.packages('UpSetR')")
      }
    }
    # Folders---------------------------------------------------------------------
    path.folder <- generate_folder(
      f = path.folder,
      rad.folder = "filter_common_markers",
      internal = internal,
      file.date = file.date,
      verbose = verbose)

    # write the dots file
    write_rad(
      data = rad.dots,
      path = path.folder,
      filename = stringi::stri_join(
        "radiator_filter_common_markers_args_", file.date, ".tsv"),
      tsv = TRUE,
      internal = internal,
      write.message = "Function call and arguments stored in: ",
      verbose = verbose
    )

    # Detect format --------------------------------------------------------------
    data.type <- radiator::detect_genomic_format(data)
    if (!data.type %in% c("tbl_df", "fst.file", "SeqVarGDSClass", "gds.file")) {
      rlang::abort("Input not supported for this function: read function documentation")
    }


    # GDS
    if (data.type %in% c("SeqVarGDSClass", "gds.file")) {
      radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)


      if (data.type == "gds.file") {
        data <- radiator::read_rad(data, verbose = verbose)
        data.type <- "SeqVarGDSClass"
      }

      # Filter parameter file: generate and initiate -----------------------------
      filters.parameters <- radiator_parameters(
        generate = TRUE,
        initiate = TRUE,
        update = FALSE,
        parameter.obj = parameters,
        data = data,
        path.folder = path.folder,
        file.date = file.date,
        internal = internal,
        verbose = verbose)


      # Filtering common markers -----------------------------------------------
      if (verbose) message("Scanning for common markers...")
      n.markers.before <- filters.parameters$info$n.snp
      strata <- extract_individuals_metadata(
        gds = data,
        ind.field.select = c("STRATA", "INDIVIDUALS"),
        whitelist = TRUE)
      n.pop <- length(unique(strata$STRATA))

      if (n.pop == 1) {
        message("Filter common markers: only 1 strata, returning data")
        return(data)
      }
      check.strata <- strata %>% dplyr::count(STRATA) %>% dplyr::filter(n <= 1)
      if (nrow(check.strata) > 0) {
        message("\nStrata with low sample size detected: fig <- FALSE\n")
        fig <- FALSE
      }

      # plot_upset--------------------------------------------------------------
      if (fig) {
        plot.filename <- stringi::stri_join(
          "common.markers.upsetrplot_", file.date)
        plot.filename <- file.path(path.folder, plot.filename)
        plot_upset(x = data,
                   data.type = data.type,
                   plot.filename = plot.filename,
                   parallel.core = parallel.core,
                   verbose = verbose)
      }

      # while SeqArray bug is fixed, PLAN B below uses tidy data
      markers.meta <- extract_markers_metadata(gds = data, whitelist = FALSE)
      bl <- not_common_markers(
        x = data, strata = strata,
        parallel.core = 1 #parallel.core
      )

      n.markers.removed <- length(bl)
      want <- c("VARIANT_ID", "MARKERS", "CHROM", "LOCUS", "POS")
      if (n.markers.removed > 0) {
        n.markers.after <- n.markers.before - n.markers.removed
        markers.meta %<>%
          dplyr::mutate(
            FILTERS = dplyr::if_else(
              VARIANT_ID %in% bl, "filter.common.markers", FILTERS
            )
          )

        write_rad(
          data = markers.meta %>% dplyr::filter(FILTERS == "filter.common.markers"),
          path = path.folder,
          filename = stringi::stri_join("blacklist.not.common.markers_", file.date, ".tsv"),
          tsv = TRUE, internal = internal, verbose = verbose)

        # Update GDS
        update_radiator_gds(
          gds = data,
          node.name = "markers.meta",
          value = markers.meta,
          sync = TRUE
        )

      }
      write_rad(
        data = markers.meta %>% dplyr::filter(FILTERS == "whitelist"),
        path = path.folder,
        filename = stringi::stri_join("whitelist.common.markers_", file.date, ".tsv"),
        tsv = TRUE,
        internal = internal,
        verbose = verbose)

    } else {#Tidy data
      # Import data ---------------------------------------------------------------
      if (is.vector(data)) data <- radiator::tidy_wide(data = data, import.metadata = TRUE)

      # Keep whitelist and blacklist (same = same space used)
      wl <- radiator::separate_markers(data = data, sep = "__", markers.meta.lists.only = TRUE)

      data %<>% dplyr::left_join(wl, by = intersect(colnames(data), colnames(wl)))


      # Filter parameter file: generate and initiate ------------------------------------------
      filters.parameters <- radiator_parameters(
        generate = TRUE,
        initiate = TRUE,
        update = FALSE,
        parameter.obj = parameters,
        data = data,
        path.folder = path.folder,
        file.date = file.date,
        internal = internal,
        verbose = verbose)

      n.pop <- filters.parameters$info$n.pop
      if (fig && n.pop == 1) {
        message("\n\nNOTE: the plot argument requires more than 1 strata\n\n")
        fig <- FALSE
      }
      if (fig) {
        plot.filename <- stringi::stri_join(
          "common.markers.upsetrplot_", file.date)
        plot.filename <- file.path(path.folder, plot.filename)
        plot_upset(x = data,
                   data.type = data.type,
                   plot.filename = plot.filename,
                   parallel.core = parallel.core,
                   verbose = verbose)
      }

      if (verbose) message("Scanning for common markers...")

      data %<>% dplyr::rename(STRATA = tidyselect::any_of("POP_ID"))


      if (tibble::has_name(data, "GT_BIN")) {
        bl <- dplyr::select(.data = data, MARKERS, STRATA, GT_BIN) %>%
          dplyr::filter(!is.na(GT_BIN))
      } else {
        bl <- dplyr::select(.data = data, MARKERS, STRATA, GT) %>%
          dplyr::filter(GT != "000000")
      }

      bl <- dplyr::distinct(bl, MARKERS, STRATA) %>%
        dplyr::count(x = ., MARKERS) %>%
        dplyr::filter(n != length(unique(data$STRATA))) %>%
        dplyr::distinct(MARKERS) %>%
        dplyr::arrange(MARKERS)

      # Remove the markers from the dataset
      n.markers.removed <- nrow(bl)

      if (n.markers.removed > 0) {
        data <- dplyr::filter(data, !MARKERS %in% bl$MARKERS)
        bl <- wl %>% dplyr::filter(MARKERS %in% bl$MARKERS)
        write_rad(
          data = bl,
          path = path.folder,
          filename = stringi::stri_join("blacklist.not.common.markers_", file.date, ".tsv"),
          tsv = TRUE, internal = internal, verbose = verbose)

        wl %<>% dplyr::filter(!MARKERS %in% bl$MARKERS)
        write_rad(
          data = wl,
          path = path.folder,
          filename = stringi::stri_join("whitelist.common.markers_", file.date, ".tsv"),
          tsv = TRUE, internal = internal, verbose = verbose)
      } else {
        bl <- wl[0,]
      }
    }#End tidy

    # Filter parameter file: update --------------------------------------------
    filters.parameters <- radiator_parameters(
      generate = FALSE,
      initiate = FALSE,
      update = TRUE,
      parameter.obj = filters.parameters,
      data = data,
      filter.name = "Filter markers in common",
      param.name = "filter.common.markers",
      values = "",
      path.folder = path.folder,
      file.date = file.date,
      internal = internal,
      verbose = verbose)

    # Return -----------------------------------------------------------------------
    radiator_results_message(
      rad.message = "\nFilter common markers:",
      filters.parameters,
      internal,
      verbose
    )

    return(data)
  }
}#End filter_common_markers

# Internal functions -----------------------------------------------------------
# Generate a blacklist of markers not in common
#' @title not_common_markers
#' @description Generate a blacklist of markers not in common
#' @rdname not_common_markers
#' @keywords internal
#' @export
not_common_markers <- function(
  x,
  strata,
  parallel.core = parallel::detectCores() - 2
) {

  # PLAN B
  # n.pop <- length(unique(strata$STRATA))
  # bl <- extract_genotypes_metadata(
  #   gds = x,
  #   genotypes.meta.select = c("INDIVIDUALS", "VARIANT_ID", "GT_BIN")
  # ) %>%
  #   dplyr::filter(!is.na(GT_BIN)) %>%
  #   join_strata(data = ., strata = strata, verbose = FALSE) %>%
  #   dplyr::distinct(VARIANT_ID, STRATA) %>%
  #   dplyr::count(x = ., VARIANT_ID) %>%
  #   dplyr::filter(n != n.pop) %>%
  #   dplyr::distinct(VARIANT_ID) %>%
  #   dplyr::arrange(VARIANT_ID) %$% VARIANT_ID

  # PLAN A using seqarray
  # Get the sample from radiator node or gds -----------------------------------
  # Note to myself : you could get the info below from the strata
  sample.bk <- extract_individuals_metadata(
    gds = x,
    ind.field.select = "INDIVIDUALS",
    whitelist = TRUE
  ) %$% INDIVIDUALS

  not_common <- function(
    split.data = NULL,
    x = NULL,
    parallel.core = parallel::detectCores() - 2
  ) {
    parallel.core.opt <- parallel_core_opt(parallel.core)

    SeqArray::seqSetFilter(
      object = x,
      sample.id = split.data,
      action = "set",
      verbose = FALSE)

    bl <- SeqArray::seqGetData(
      gdsfile = x,
      var.name = "variant.id")[SeqArray::seqMissing(
        gdsfile = x,
        per.variant = TRUE,
        parallel = parallel.core.opt
      ) == 1]
    return(bl)
  }#End not_common

  bl <- dplyr::group_split(strata, STRATA, .keep = FALSE) %>%
    purrr::flatten(.) %>%
    purrr::map(.x = .,
               .f = not_common,
               x = x,
               parallel.core = parallel.core
    ) %>% unlist %>% unique %>% sort

  # reset
  # summary_gds(x)
  # SeqArray::seqSetFilter(x, action = "pop", verbose = TRUE)
  SeqArray::seqSetFilter(
    object = x,
    sample.id = sample.bk,
    action = "set",
    verbose = FALSE)
  # summary_gds(x)
  return(bl)
}#End not_common_markers

# Generate UPSETR plot----------------------------------------------------------
#' @title plot_upset
#' @description Generate UpSetR plot
#' @rdname plot_upset
#' @keywords internal
#' @export

plot_upset <- function(
  x,
  data.type,
  plot.filename = NULL,
  parallel.core = parallel::detectCores() - 2,
  verbose = FALSE
) {

  # For pc set the number of core to 1 -----------------------------------------
  parallel.core <- 1

  # if (Sys.info()[['sysname']] == "Windows") parallel.core <- 1

  if (verbose) message("Generating UpSet plot to visualize markers in common")

  if (data.type == "SeqVarGDSClass") {
    # Get the sample from radiator node or gds ---------------------------------
    strata <- extract_individuals_metadata(
      gds = x,
      ind.field.select = c("STRATA", "INDIVIDUALS"),
      whitelist = TRUE
    )
    n.pop = length(unique(strata$STRATA))

    sample.bk <- strata$INDIVIDUALS
    missing_markers_pop <- function(
      strata.split,
      x,
      parallel.core = parallel::detectCores() - 2
    ) {
      # strata.split <- dplyr::group_split(.tbl = strata, STRATA)[[4]]
      parallel.core.opt <- parallel_core_opt(parallel.core)

      SeqArray::seqSetFilter(
        object = x,
        sample.id = strata.split$INDIVIDUALS,
        action = "set",
        verbose = FALSE
      )

      res <- tibble::tibble(
        STRATA = SeqArray::seqMissing(
          gdsfile = x,
          per.variant = TRUE,
          parallel = parallel.core.opt
        ) %>%
          magrittr::inset(. == 1L, 9L) %>%
          magrittr::inset(. < 1L, 1L) %>%
          magrittr::inset(. == 9L, 0L)
      ) %>%
        magrittr::set_colnames(., unique(strata.split$STRATA))
      return(res)
    }#End not_common

    plot.data <- dplyr::group_split(.tbl = strata, STRATA) %>%
      purrr::map_dfc(
        .x = .,
        .f = missing_markers_pop,
        x = x,
        parallel.core = parallel.core
      ) %>%
      data.frame(.)#UpSetR requires data.frame

    readr::write_tsv(x = plot.data, file = stringi::stri_join(plot.filename, ".tsv"))
    SeqArray::seqSetFilter(
      object = x,
      sample.id = sample.bk,
      action = "set",
      verbose = FALSE)
  } else {
    n.pop = length(unique(x$STRATA))
    if (tibble::has_name(x, "GT_BIN")) {
      plot.data <- dplyr::filter(x, !is.na(GT_BIN))
    } else {
      plot.data <- dplyr::filter(x, GT != "000000")
    }

    plot.data <- dplyr::distinct(plot.data, MARKERS, STRATA) %>%
      dplyr::mutate(
        n = rep(1, n()),
        STRATA = stringi::stri_join("POP_", STRATA)
      ) %>%
      tidyr::pivot_wider(data = ., names_from = "STRATA", values_from = "n", values_fill = 0) %>%
      data.frame(.)#UpSetR requires data.frame
  }

  # generate the plot
  # print(dev.list())

  # pdf(
  #   file = stringi::stri_join(plot.filename, ".pdf"),
  #   onefile = FALSE
  # )
  grDevices::png(filename = stringi::stri_join(plot.filename, ".png"))
  print(
    UpSetR::upset(
    data = plot.data,
    nsets = n.pop,
    order.by = "freq",
    empty.intersections = NULL)
  )
  dev.off()
  # print(dev.list())

}#End plot_upset
thierrygosselin/radiator documentation built on May 5, 2024, 5:12 a.m.