R/filter_individuals.R

Defines functions filter_individuals

Documented in filter_individuals

# Filter/blacklist individuals

#' @name filter_individuals

#' @title Filter individuals based on genotyping/missingness rate,
#' heterozygosity and total coverage

#' @description Remove individuals with bad QC based on:
#' \itemize{
#' \item missingness (genotyping rate)
#' \item heterozygosity
#' \item coverage (total, median, iqr)
#' }
#'
#' \strong{Filter targets}: Individuals
#'
#' \strong{Statistics}: Missingness, heterozygosity and coverage
#'
#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and might be of interest for users who wants to blacklist individuals.

#' @param data (2 options) A Genomic Data Structure (GDS)
#' file or object generated by radiator.
#'
#' \emph{How to get GDS?}
#' Look into:
#' \code{\link{read_vcf}} or
#' \code{\link{tidy_vcf}}.


#' @param filter.individuals.missing (optional, double) A proportion above which the individuals are
#' blacklisted and removed from the dataset.
#' Default: \code{filter.individuals.missing = NULL}.

#' @param filter.individuals.heterozygosity (optional, string of doubles) A proportion below and
#' above which the individuals are blacklisted and removed from the dataset.
#' Default: \code{filter.individuals.heterozygosity = NULL}.

#' @param filter.individuals.coverage.total (optional, string of doubles)
#' Target the total coverage per samples.
#' A proportion below and
#' above which the individuals are blacklisted and removed from the dataset.
#' Default: \code{filter.individuals.coverage.total = NULL}.

#' @param filter.individuals.coverage.median (optional, string of integers)
#' Target the median coverage per samples.
#' Integers, below and above, that blacklist individuals (removed from the dataset)
#' Default: \code{filter.individuals.coverage.median = NULL}.

#' @param filter.individuals.coverage.iqr (optional, string of integers)
#' Target the IQR (Interquartile Range) coverage per samples.
#' Integers, below and above, that blacklist individuals (removed from the dataset)
#' Default: \code{filter.individuals.coverage.iqr = NULL}.

#' @inheritParams radiator_common_arguments

#' @return A list with the filtered input and blacklist of individuals.

#' @export
#' @rdname filter_individuals

#' @seealso
#' \code{\link{filter_rad}}
#' \code{\link{tidy_genomic_data}}, \code{\link{read_vcf}},
#' \code{\link{tidy_vcf}}.

#' @examples
#' \dontrun{
#' require(SeqArray)
#'
#' # blacklisting outliers individuals:
#' id.qc <- radiator::filter_individuals(
#'     data = "my.radiator.gds.rad",
#'     filter.individuals.missing = "outliers",
#'     filter.individuals.heterozygosity = "outliers",
#'     filter.individuals.coverage.total = "outliers")
#'
#' # using values to blacklist individuals:
#' id.qc <- radiator::filter_individuals(
#'     data = "my.radiator.gds.rad",
#'     filter.individuals.missing = 0.5,
#'     filter.individuals.heterozygosity = c(0.02, 0.03),
#'     filter.individuals.coverage.total = c(900000, 5000000))
#'
#' }

#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}


filter_individuals <- function(
  data,
  interactive.filter = TRUE,
  filter.individuals.missing = NULL,
  filter.individuals.heterozygosity = NULL,
  filter.individuals.coverage.total = NULL,
  filter.individuals.coverage.median = NULL,
  filter.individuals.coverage.iqr = NULL,
  parallel.core = parallel::detectCores() - 1,
  verbose = TRUE,
  ...
) {

  ##TESTS
  # path.folder <- NULL
  # parameters <- NULL
  # id.stats <- NULL
  # internal <- FALSE
  # subsample <- NULL
  # subsample.markers.stats <- NULL
  # interactive.filter = TRUE
  # filter.individuals.missing = NULL
  # filter.individuals.heterozygosity = NULL
  # filter.individuals.coverage.total = NULL
  # filter.individuals.coverage.median = NULL
  # filter.individuals.coverage.iqr = NULL
  # parallel.core = parallel::detectCores() - 1
  # verbose = TRUE


  if (interactive.filter ||
      !is.null(filter.individuals.missing) ||
      !is.null(filter.individuals.heterozygosity) ||
      !is.null(filter.individuals.coverage.total) ||
      !is.null(filter.individuals.coverage.median) ||
      !is.null(filter.individuals.coverage.iqr)
  ) {
    if (interactive.filter) verbose <- TRUE
    radiator_function_header(f.name = "filter_individuals", verbose = verbose)

    if (Sys.info()[['sysname']] == "Windows") parallel.core <- 1

    # Cleanup---------------------------------------------------------------------
    file.date <- format(Sys.time(), "%Y%m%d@%H%M")
    if (verbose) message("Execution date@time: ", file.date)
    old.dir <- getwd()
    opt.change <- getOption("width")
    options(width = 70)
    timing <- radiator_tic()
    #back to the original directory and options
    on.exit(setwd(old.dir), add = TRUE)
    on.exit(options(width = opt.change), add = TRUE)
    on.exit(radiator_toc(timing, verbose = verbose), add = TRUE)
    on.exit(radiator_function_header(f.name = "filter_individuals", start = FALSE, verbose = verbose), add = TRUE)
    # on.exit(rm(list = setdiff(ls(envir = sys.frame(-1L)), obj.keeper), envir = sys.frame(-1L)))

    # Function call and dotslist -------------------------------------------------
    rad.dots <- radiator_dots(
      func.name = as.list(sys.call())[[1]],
      fd = rlang::fn_fmls_names(),
      args.list = as.list(environment()),
      dotslist = rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE),
      keepers = c("path.folder", "parameters", "internal", "id.stats",
                  "subsample", "subsample.markers.stats"),
      verbose = FALSE
    )

    # Checking for missing and/or default arguments ----------------------------
    if (missing(data)) rlang::abort("data is missing")

    # Folders---------------------------------------------------------------------
    path.folder <- generate_folder(
      f = path.folder,
      rad.folder = "filter_individuals",
      internal = internal,
      file.date = file.date,
      verbose = verbose)

    # write the dots file
    write_rad(
      data = rad.dots,
      path = path.folder,
      filename = stringi::stri_join("radiator_filter_individuals_args_", file.date, ".tsv"),
      tsv = TRUE,
      internal = internal,
      write.message = "Function call and arguments stored in: ",
      verbose = verbose
    )

    # Message about steps taken during the process -----------------------------
    if (interactive.filter) {
      message("Interactive mode: on\n")
      message("Step 1. Visualization")
      message("Step 2. Missingness")
      message("Step 3. Heterozygosity")
      message("Step 4. Coverage (if available)\n\n")
    }

    # Detect format ------------------------------------------------------------
    data.type <- radiator::detect_genomic_format(data)

    # Import data --------------------------------------------------------------
    if (!data.type %in% c("SeqVarGDSClass", "gds.file")) {
      rlang::abort("Input not supported for this function: read function documentation")
    }
    if (data.type == "gds.file") {
      data <- radiator::read_rad(data, verbose = verbose)
      data.type <- "SeqVarGDSClass"
    }
    radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)

    # Filter parameter file: generate and initiate -----------------------------
    filters.parameters <- radiator_parameters(
      generate = TRUE,
      initiate = TRUE,
      update = FALSE,
      parameter.obj = parameters,
      data = data,
      path.folder = path.folder,
      file.date = file.date,
      internal = internal,
      verbose = verbose)

    # stats  -------------------------------------------------------------------
    filter.monomorphic <- FALSE

    # Step 1. Visuals ----------------------------------------------------------
    if (interactive.filter) {
      message("\nStep 1. Visualization of samples QC\n")
    }

    # Note to myself: for now it's better not to subsample, because of filtering
    variant.select <- subsample <- NULL
    subsample.markers.stats <- 1

    id.stats <- generate_stats(
      gds = data,
      individuals = TRUE,
      markers = FALSE,
      exhaustive = FALSE,
      subsample = variant.select,
      path.folder = path.folder,
      file.date = file.date,
      parallel.core = parallel.core,
      verbose = TRUE
    )
    print(id.stats$i.fig)


    # Step 2. Missingness-----------------------------------------------------------------
    if (interactive.filter) {
      message("\nStep 2. Filtering markers based individual missingness/genotyping\n")

      filter.individuals.missing <- radiator_question(
        x = "Do you want to blacklist samples based on missingness ? (y/n):",
        answer.opt = c("y", "n"))

      if (filter.individuals.missing == "y") {
        outlier.stats <- radiator_question(
          x = "2 options to blacklist samples:\n1. based on the outlier statistics\n2. enter your own threshold", minmax = c(1, 2))
        if (outlier.stats == 1) {
          filter.individuals.missing <- "outliers"
        } else {
          filter.individuals.missing <- radiator_question(
            x = "Enter the proportion threshold (0-1)
The maximum amount of missingness you tolerate for a sample (e.g. 0.3): ", minmax = c(0, 1))
        }
        outlier.stats <- NULL
      } else {
        filter.individuals.missing <- NULL
      }
    }
    if (!is.null(filter.individuals.missing)) {
      if (!purrr::is_double(filter.individuals.missing)) {
        # if max and outliers high are the same leave the value
        # if max and outliers high are not the same do gymnastic
        if (id.stats$i.stats$OUTLIERS_HIGH[1] == id.stats$i.stats$MAX[1]) {
          higher.eq <- FALSE
        } else {
          higher.eq <- TRUE
        }
        outlier.id.missing <- id.stats$i.stats$OUTLIERS_HIGH[1]
        if (verbose) message("\nRemoving outliers individuals based on genotyping statistics: ", outlier.id.missing)
        filter.individuals.missing <- outlier.id.missing
      } else {
        message("\nRemoving individuals based on genotyping statistics: ", filter.individuals.missing)
        higher.eq <- FALSE
      }

      bl <- dplyr::ungroup(id.stats$i.info)

      if (higher.eq) {
        bl %<>% dplyr::filter(MISSING_PROP >= filter.individuals.missing)
      } else {
        bl %<>% dplyr::filter(MISSING_PROP > filter.individuals.missing)
      }
      bl %<>%
        dplyr::distinct(INDIVIDUALS, .keep_all = TRUE) %>%
        dplyr::mutate(FILTER = "filter.individuals.missing")
      n.bl <- nrow(bl)

      individuals <- extract_individuals_metadata(gds = data, whitelist = FALSE) %>%
        dplyr::mutate(
          FILTERS = dplyr::if_else(
            INDIVIDUALS %in% bl$INDIVIDUALS,
            "filter.individuals.missing", FILTERS
          )
        )
      update_radiator_gds(
        gds = data, node.name = "individuals.meta", value = individuals, sync = TRUE)

      if (n.bl > 0) {
        filter.monomorphic <- TRUE
        bl.filename <- stringi::stri_join("blacklist.individuals.missing_", file.date, ".tsv")
        readr::write_tsv(x = bl, file = file.path(path.folder, bl.filename))
      }

      # Filter parameter file: update
      filters.parameters <- radiator_parameters(
        generate = FALSE,
        initiate = FALSE,
        update = TRUE,
        parameter.obj = filters.parameters,
        data = data,
        filter.name = "Filter individuals based on missingness (with outlier stats or values)",
        param.name = "filter.individuals.missing",
        values = filter.individuals.missing,
        path.folder = path.folder,
        file.date = file.date,
        internal = internal,
        verbose = verbose)

      # results ------------------------------------------------------------------
      radiator_results_message(
        rad.message = stringi::stri_join("\nFilter individuals based on missingness: ",
                                         filter.individuals.missing),
        filters.parameters,
        internal,
        verbose
      )
    }#End filter.individuals.missing

    # Step 3. Heterozygosity------------------------------------------------------------
    if (interactive.filter) {
      message("\nStep 3. Filtering markers based on individual heterozygosity\n")

      filter.individuals.heterozygosity <- radiator_question(
        x = "Do you want to blacklist samples based on heterozygosity ? (y/n):",
        answer.opt = c("y", "n"))

      if (filter.individuals.heterozygosity == "y") {
        outlier.stats <- radiator_question(
          x = "2 options to blacklist samples:\n1. based on the outlier statistics\n2. enter your own threshold", minmax = c(1, 2))
        if (outlier.stats == 1) {
          filter.individuals.heterozygosity <- "outliers"
        } else {
          filter.individuals.heterozygosity <- c(0,1)
          filter.individuals.heterozygosity[1] <- radiator_question(
            x = "Enter the min proportion threshold (0-1)
The minimum amount of heterozygosity you tolerate for a sample:", minmax = c(0, 1))
          filter.individuals.heterozygosity[2] <- radiator_question(
            x = "Enter the max proportion threshold (0-1)
The maximum amount of heterozygosity you tolerate for a sample:", minmax = c(0, 1))
        }
        outlier.stats <- NULL
      } else {
        filter.individuals.heterozygosity <- NULL
      }
    }
    if (!is.null(filter.individuals.heterozygosity)) {
      if (length(filter.individuals.heterozygosity) > 1) {
        het.low <- filter.individuals.heterozygosity[1]
        het.high <- filter.individuals.heterozygosity[2]
        if (verbose) message("\nRemoving individuals based on heterozygosity statistics: ", het.low, " / ", het.high)
        higher.eq <- FALSE
        lower.eq <- FALSE
      } else {
        if (is.character(filter.individuals.heterozygosity)) {
          if (id.stats$i.stats$OUTLIERS_LOW[2] == id.stats$i.stats$MIN[2]) {
            lower.eq <- FALSE
          } else {
            lower.eq <- TRUE
          }
          if (id.stats$i.stats$OUTLIERS_HIGH[2] == id.stats$i.stats$MAX[2]) {
            higher.eq <- FALSE
          } else {
            higher.eq <- TRUE
          }
          het.low <- id.stats$i.stats$OUTLIERS_LOW[2]
          het.high <- id.stats$i.stats$OUTLIERS_HIGH[2]
          if (verbose) message("\nRemoving outliers individuals based on heterozygosity statistics: ", het.low, " / ", het.high)
        } else {
          rlang::abort("Unknown filter.individuals.heterozygosity thresholds used")
        }
      }

      bl <- dplyr::ungroup(id.stats$i.info)

      # fl & fh for filter high and low
      if (lower.eq) {
        fl <- "HETEROZYGOSITY <= het.low"
      } else {
        fl <- "HETEROZYGOSITY < het.low"
      }
      if (higher.eq) {
        fh <- "HETEROZYGOSITY >= het.high"
      } else {
        fh <- "HETEROZYGOSITY > het.high"
      }

      filter.het <- stringi::stri_join(fh, fl, sep = " | ")
      bl %<>%
        dplyr::filter(!!rlang::parse_expr(filter.het)) %>%
        dplyr::distinct(INDIVIDUALS) %>%
        dplyr::mutate(FILTER = "filter.individuals.heterozygosity")
      n.bl <- nrow(bl)

      individuals <- extract_individuals_metadata(gds = data, whitelist = FALSE) %>%
        dplyr::mutate(
          FILTERS = dplyr::if_else(
            INDIVIDUALS %in% bl$INDIVIDUALS,
            "filter.individuals.heterozygosity", FILTERS
          )
        )
      update_radiator_gds(gds = data, node.name = "individuals.meta", value = individuals, sync = TRUE)

      if (n.bl > 0) {
        filter.monomorphic <- TRUE

        # if (verbose) message("    number of individuals blacklisted based on heterozygosity: ", n.bl)
        bl.filename <- stringi::stri_join("blacklist.individuals.heterozygosity_", file.date, ".tsv")
        readr::write_tsv(x = bl, file = file.path(path.folder, bl.filename))
        # bl.i <- update_bl_individuals(gds = data, update = bl)
        id.stats$i.info  %<>%
          dplyr::filter(!INDIVIDUALS %in% bl$INDIVIDUALS)

        # update_radiator_gds(gds = data, node.name = "individuals.meta", value = id.stats$info, sync = TRUE)
      }
      # Filter parameter file: update
      filters.parameters <- radiator_parameters(
        generate = FALSE,
        initiate = FALSE,
        update = TRUE,
        parameter.obj = filters.parameters,
        data = data,
        filter.name = "Filter individuals based on heterozygosity (with outlier stats or values)",
        param.name = "filter.individuals.heterozygosity",
        values = paste(het.low, het.high, collapse = " / "),
        path.folder = path.folder,
        file.date = file.date,
        internal = internal,
        verbose = verbose)

      # results ------------------------------------------------------------------
      radiator_results_message(
        rad.message = stringi::stri_join("\nFilter individuals based on heterozygosity: ", paste(het.low, het.high, collapse = " / ")),
        filters.parameters,
        internal,
        verbose
      )
    }#End filter.individuals.heterozygosity

    # Step 4. Coverage ---------------------------------------------------------
    depth.info <- "total coverage" %in% id.stats$i.stats$GROUP
    if (depth.info) {
      if (interactive.filter) {

        message("\nStep 4. Filtering markers based on individual's coverage\n")

        # TOTAL COVERAGE -------------------------------------------------------
        filter.individuals.coverage.total <- radiator_question(
          x = "Do you want to blacklist samples based on TOTAL coverage ? (y/n):",
          answer.opt = c("y", "n"))

        if (filter.individuals.coverage.total == "y") {
          outlier.stats <- radiator_question(
            x = "2 options to blacklist samples:\n1. based on the outlier statistics\n2. enter your own threshold", minmax = c(1, 2))
          if (outlier.stats == 1) {
            filter.individuals.coverage.total <- "outliers"
          } else {
            filter.individuals.coverage.total <- c(0,10000000000000000000000)
            filter.individuals.coverage.total[1] <- radiator_question(
              x = "Enter the minimum TOTAL coverage threshold you tolerate for a sample:",
              minmax = c(0, 10000000000000000000000)
            )
            filter.individuals.coverage.total[2] <- radiator_question(
              x = "Enter the maximum TOTAL coverage threshold you tolerate for a sample:",
              minmax = c(0, 10000000000000000000000)
              )
          }
          outlier.stats <- NULL
        } else {# if no...
          filter.individuals.coverage.total <- NULL
        }

        # MEDIAN COVERAGE  -----------------------------------------------------
        filter.individuals.coverage.median <- radiator_question(
          x = "Do you want to blacklist samples based on MEDIAN coverage ? (y/n):",
          answer.opt = c("y", "n")
        )

        if (filter.individuals.coverage.median == "y") {
          outlier.stats <- radiator_question(
            x = "2 options to blacklist samples:\n1. based on the outlier statistics\n2. enter your own threshold", minmax = c(1, 2))
          if (outlier.stats == 1) {
            filter.individuals.coverage.median <- "outliers"
          } else {
            filter.individuals.coverage.median <- c(0,10000000000000000000000)
            filter.individuals.coverage.median[1] <- radiator_question(
              x = "Enter the minimum MEDIAN coverage threshold you tolerate for a sample:",
              minmax = c(0, 10000000000000000000000)
              )
            filter.individuals.coverage.median[2] <- radiator_question(
              x = "Enter the maximum MEDIAN coverage threshold you tolerate for a sample:",
              minmax = c(0, 10000000000000000000000)
              )
          }
          outlier.stats <- NULL
        } else {# if no...
          filter.individuals.coverage.median <- NULL
        }


        # IQR COVERAGE ---------------------------------------------------------
        filter.individuals.coverage.iqr <- radiator_question(
          x = "Do you want to blacklist samples based on Interquartile Range (IQR) coverage ? (y/n):",
          answer.opt = c("y", "n")
        )

        if (filter.individuals.coverage.iqr == "y") {
          outlier.stats <- radiator_question(
            x = "2 options to blacklist samples:\n1. based on the outlier statistics\n2. enter your own threshold", minmax = c(1, 2))
          if (outlier.stats == 1) {
            filter.individuals.coverage.iqr <- "outliers"
          } else {
            filter.individuals.coverage.iqr <- c(0,10000000000000000000000)
            filter.individuals.coverage.iqr[1] <- radiator_question(
              x = "Enter the minimum IQR coverage threshold you tolerate for a sample:",
              minmax = c(0, 10000000000000000000000)
            )
            filter.individuals.coverage.iqr[2] <- radiator_question(
              x = "Enter the maximum IQR coverage threshold you tolerate for a sample:",
              minmax = c(0, 10000000000000000000000)
            )
          }
          outlier.stats <- NULL
        } else {# if no...
          filter.individuals.coverage.iqr <- NULL
        }

      }#End interactive mode


      # filtering done here...
      # TOTAL COVERAGE -------------------------------------------------------
      if (!is.null(filter.individuals.coverage.total)) {
        if (length(filter.individuals.coverage.total) > 1) {
          cov.low <- filter.individuals.coverage.total[1]
          cov.high <- filter.individuals.coverage.total[2]
          if (verbose) message("\nRemoving individuals based on total coverage statistics: ", cov.low, " / ", cov.high)
          lower.eq <- higher.eq <- FALSE
        } else {
          if (is.character(filter.individuals.coverage.total)) {
            if (id.stats$i.stats$OUTLIERS_LOW[3] == id.stats$i.stats$MIN[3]) {
              lower.eq <- FALSE
            } else {
              lower.eq <- TRUE
            }
            if (id.stats$i.stats$OUTLIERS_HIGH[3] == id.stats$i.stats$MAX[3]) {
              higher.eq <- FALSE
            } else {
              higher.eq <- TRUE
            }
            cov.low <- id.stats$i.stats$OUTLIERS_LOW[3]
            cov.high <- id.stats$i.stats$OUTLIERS_HIGH[3]
            if (verbose) message("\nRemoving outliers individuals based on total coverage statistics: ", cov.low, " / ", cov.high)
            } else {
            rlang::abort("Unknown TOTAL coverage thresholds used")
          }
        }

        bl <- dplyr::ungroup(id.stats$i.info)

        # fl & fh for filter high and low
        if (lower.eq) {
          fl <- "COVERAGE_TOTAL <= cov.low"
        } else {
          fl <- "COVERAGE_TOTAL < cov.low"
        }
        if (higher.eq) {
          fh <- "COVERAGE_TOTAL >= cov.high"
        } else {
          fh <- "COVERAGE_TOTAL > cov.high"
        }

        filter.cov <- stringi::stri_join(fh, fl, sep = " | ")
        bl %<>%
          dplyr::filter(!!rlang::parse_expr(filter.cov)) %>%
          dplyr::distinct(INDIVIDUALS) %>%
          dplyr::mutate(FILTER = "filter.individuals.coverage.total")
        n.bl <- nrow(bl)

        individuals <- extract_individuals_metadata(gds = data, whitelist = FALSE) %>%
          dplyr::mutate(
            FILTERS = dplyr::if_else(
              INDIVIDUALS %in% bl$INDIVIDUALS,
              "filter.individuals.coverage.total", FILTERS
            )
          )
        update_radiator_gds(gds = data, node.name = "individuals.meta", value = individuals, sync = TRUE)

        if (n.bl > 0) {
          filter.monomorphic <- TRUE
          bl.filename <- stringi::stri_join("blacklist.individuals.coverate.total_", file.date, ".tsv")
          readr::write_tsv(x = bl, file = file.path(path.folder, bl.filename))
        }
        # Filter parameter file: update
        filters.parameters <- radiator_parameters(
          generate = FALSE,
          initiate = FALSE,
          update = TRUE,
          parameter.obj = filters.parameters,
          data = data,
          filter.name = "Filter individuals based on total coverage (with outlier stats or values)",
          param.name = "filter.individuals.coverage.total",
          values = paste(cov.low, cov.high, collapse = " / "),
          path.folder = path.folder,
          file.date = file.date,
          internal = internal,
          verbose = verbose)

        # results ------------------------------------------------------------------
        radiator_results_message(
          rad.message = stringi::stri_join("\nFilter individuals based on total coverage: ", paste(cov.low, cov.high, collapse = " / ")),
          filters.parameters,
          internal,
          verbose
        )
      }#End coverage total

      # MEDIAN COVERAGE  -----------------------------------------------------
      if (!is.null(filter.individuals.coverage.median)) {
        if (length(filter.individuals.coverage.median) > 1) {
          cov.low <- filter.individuals.coverage.median[1]
          cov.high <- filter.individuals.coverage.median[2]
          if (verbose) message("\nRemoving individuals based on MEDIAN coverage statistics: ", cov.low, " / ", cov.high)
          lower.eq <- higher.eq <- FALSE
        } else {
          if (is.character(filter.individuals.coverage.median)) {
            if (id.stats$i.stats$OUTLIERS_LOW[5] == id.stats$i.stats$MIN[5]) {
              lower.eq <- FALSE
            } else {
              lower.eq <- TRUE
            }
            if (id.stats$i.stats$OUTLIERS_HIGH[5] == id.stats$i.stats$MAX[5]) {
              higher.eq <- FALSE
            } else {
              higher.eq <- TRUE
            }
            cov.low <- id.stats$i.stats$OUTLIERS_LOW[5]
            cov.high <- id.stats$i.stats$OUTLIERS_HIGH[5]
            if (verbose) message("\nRemoving outliers individuals based on MEDIAN coverage statistics: ", cov.low, " / ", cov.high)
          } else {
            rlang::abort("Unknown MEDIAN coverage thresholds used")
          }
        }

        bl <- dplyr::ungroup(id.stats$i.info)

        # fl & fh for filter high and low
        if (lower.eq) {
          fl <- "COVERAGE_MEDIAN <= cov.low"
        } else {
          fl <- "COVERAGE_MEDIAN < cov.low"
        }
        if (higher.eq) {
          fh <- "COVERAGE_MEDIAN >= cov.high"
        } else {
          fh <- "COVERAGE_MEDIAN > cov.high"
        }

        filter.cov <- stringi::stri_join(fh, fl, sep = " | ")
        bl %<>%
          dplyr::filter(!!rlang::parse_expr(filter.cov)) %>%
          dplyr::distinct(INDIVIDUALS) %>%
          dplyr::mutate(FILTER = "filter.individuals.coverage.median")
        n.bl <- nrow(bl)

        individuals <- extract_individuals_metadata(gds = data, whitelist = FALSE) %>%
          dplyr::mutate(
            FILTERS = dplyr::if_else(
              INDIVIDUALS %in% bl$INDIVIDUALS,
              "filter.individuals.coverage.median", FILTERS
            )
          )
        update_radiator_gds(gds = data, node.name = "individuals.meta", value = individuals, sync = TRUE)

        if (n.bl > 0) {
          filter.monomorphic <- TRUE
          bl.filename <- stringi::stri_join("blacklist.individuals.coverate.median_", file.date, ".tsv")
          readr::write_tsv(x = bl, file = file.path(path.folder, bl.filename))
        }
        # Filter parameter file: update
        filters.parameters <- radiator_parameters(
          generate = FALSE,
          initiate = FALSE,
          update = TRUE,
          parameter.obj = filters.parameters,
          data = data,
          filter.name = "Filter individuals based on median coverage (with outlier stats or values)",
          param.name = "filter.individuals.coverage.median",
          values = paste(cov.low, cov.high, collapse = " / "),
          path.folder = path.folder,
          file.date = file.date,
          internal = internal,
          verbose = verbose)

        # results ------------------------------------------------------------------
        radiator_results_message(
          rad.message = stringi::stri_join("\nFilter individuals based on median coverage: ", paste(cov.low, cov.high, collapse = " / ")),
          filters.parameters,
          internal,
          verbose
        )
      }#End coverage median

      # IQR COVERAGE ---------------------------------------------------------
      if (!is.null(filter.individuals.coverage.iqr)) {
        if (length(filter.individuals.coverage.iqr) > 1) {
          cov.low <- filter.individuals.coverage.iqr[1]
          cov.high <- filter.individuals.coverage.iqr[2]
          if (verbose) message("\nRemoving individuals based on IQR coverage statistics: ", cov.low, " / ", cov.high)
          lower.eq <- higher.eq <- FALSE
        } else {
          if (is.character(filter.individuals.coverage.iqr)) {
            if (id.stats$i.stats$OUTLIERS_LOW[6] == id.stats$i.stats$MIN[6]) {
              lower.eq <- FALSE
            } else {
              lower.eq <- TRUE
            }
            if (id.stats$i.stats$OUTLIERS_HIGH[6] == id.stats$i.stats$MAX[6]) {
              higher.eq <- FALSE
            } else {
              higher.eq <- TRUE
            }
            cov.low <- id.stats$i.stats$OUTLIERS_LOW[6]
            cov.high <- id.stats$i.stats$OUTLIERS_HIGH[6]
            if (verbose) message("\nRemoving outliers individuals based on IQR coverage statistics: ", cov.low, " / ", cov.high)
          } else {
            rlang::abort("Unknown IQR coverage thresholds used")
          }
        }

        bl <- dplyr::ungroup(id.stats$i.info)

        # fl & fh for filter high and low
        if (lower.eq) {
          fl <- "COVERAGE_IQR <= cov.low"
        } else {
          fl <- "COVERAGE_IQR < cov.low"
        }
        if (higher.eq) {
          fh <- "COVERAGE_IQR >= cov.high"
        } else {
          fh <- "COVERAGE_IQR > cov.high"
        }

        filter.cov <- stringi::stri_join(fh, fl, sep = " | ")
        bl %<>%
          dplyr::filter(!!rlang::parse_expr(filter.cov)) %>%
          dplyr::distinct(INDIVIDUALS) %>%
          dplyr::mutate(FILTER = "filter.individuals.coverage.iqr")
        n.bl <- nrow(bl)

        individuals <- extract_individuals_metadata(gds = data, whitelist = FALSE) %>%
          dplyr::mutate(
            FILTERS = dplyr::if_else(
              INDIVIDUALS %in% bl$INDIVIDUALS,
              "filter.individuals.coverage.iqr", FILTERS
            )
          )
        update_radiator_gds(gds = data, node.name = "individuals.meta", value = individuals, sync = TRUE)

        if (n.bl > 0) {
          filter.monomorphic <- TRUE
          bl.filename <- stringi::stri_join("blacklist.individuals.coverate.iqr_", file.date, ".tsv")
          readr::write_tsv(x = bl, file = file.path(path.folder, bl.filename))
        }
        # Filter parameter file: update
        filters.parameters <- radiator_parameters(
          generate = FALSE,
          initiate = FALSE,
          update = TRUE,
          parameter.obj = filters.parameters,
          data = data,
          filter.name = "Filter individuals based on iqr coverage (with outlier stats or values)",
          param.name = "filter.individuals.coverage.iqr",
          values = paste(cov.low, cov.high, collapse = " / "),
          path.folder = path.folder,
          file.date = file.date,
          internal = internal,
          verbose = verbose)

        # results ------------------------------------------------------------------
        radiator_results_message(
          rad.message = stringi::stri_join("\nFilter individuals based on IQR coverage: ", paste(cov.low, cov.high, collapse = " / ")),
          filters.parameters,
          internal,
          verbose
        )
      }#End coverage iqr


    }#End DP

    # MONOMORPHIC MARKERS --------------------------------------------------
    data <- filter_monomorphic(
      data = data,
      filter.monomorphic = filter.monomorphic,
      parallel.core = parallel.core,
      verbose = FALSE,
      parameters = filters.parameters,
      path.folder = path.folder,
      internal = FALSE)
  }#before this one
  return(data)
}#End filter_individuals
thierrygosselin/radiator documentation built on May 5, 2024, 5:12 a.m.