Nothing
#' Clump loci based on a Linkage Disequilibrium threshold
#'
#' This function uses clumping to remove SNPs at high LD. When used with its
#' default options, clumping based on MAF is similar to standard pruning (as
#' done by PLINK with "--indep-pairwise (size+1) 1 thr.r2", but it results in a
#' better spread of SNPs over the chromosome.
#'
#' Any missing values in the genotypes of a `gen_tibble` passed to
#' `loci_ld_clump` will cause an error. To deal with missingness, see
#' [gt_impute_simple()].
#'
#' @param .x a [`gen_tibble`] object
#' @param .col the column to be used when a tibble (or grouped tibble is passed
#' directly to the function). This defaults to "genotypes" and can only take
#' that value. There is no need for the user to set it, but it is included to
#' resolve certain tidyselect operations.
#' @param thr_r2 Threshold over the squared correlation between two SNPs.
#' Default is `0.2`.
#' @param S A vector of loci statistics which express the importance of each SNP
#' (the more important is the SNP, the greater should be the corresponding
#' statistic).\cr For example, if `S` follows the standard normal
#' distribution, and "important" means significantly different from 0, you
#' must use `abs(S)` instead.\cr
#' **If not specified, MAFs are computed and used.**
#' @param size For one SNP, window size around this SNP to compute correlations.
#' Default is `100 / thr.r2` for clumping (0.2 -> 500; 0.1 -> 1000; 0.5 ->
#' 200). If `use_positions = FALSE`, this is a window in number of SNPs,
#' otherwise it is a window in kb (genetic distance). Ideally, use positions,
#' as they provide a more sensible approach.
#' @param use_positions boolean, if TRUE (the default), `size` is in kb, if
#' FALSE size is the number of SNPs.
#' @param exclude Vector of SNP indices to exclude anyway. For example, can be
#' used to exclude long-range LD regions (see Price2008). Another use can be
#' for thresholding with respect to p-values associated with `S`.
#' @param n_cores number of cores to be used
#' @param return_id boolean on whether the id of SNPs to keep should be
#' returned. It defaults to FALSE, which returns a vector of booleans (TRUE or
#' FALSE)
#' @param ... currently not used.
#' @return a boolean vector indicating whether the SNP should be kept (if
#' 'return_id = FALSE', the default), else a vector of SNP indices to be kept
#' (if 'return_id = TRUE')
#' @rdname loci_ld_clump
#' @export
#' @examplesIf all(rlang::is_installed(c("RhpcBLASctl", "data.table")))
#' \dontshow{
#' data.table::setDTthreads(2)
#' RhpcBLASctl::blas_set_num_threads(2)
#' RhpcBLASctl::omp_set_num_threads(2)
#' }
#' example_gt <- load_example_gt("gen_tbl") %>% gt_impute_simple()
#'
#' # To return a boolean vector indicating whether the SNP should be kept
#' example_gt %>% loci_ld_clump()
#' # To return a vector of SNP indices to be kept
#' example_gt %>% loci_ld_clump(return_id = TRUE)
#'
loci_ld_clump <- function(.x, .col = "genotypes", ...) {
UseMethod("loci_ld_clump", .x)
}
#' @export
#' @rdname loci_ld_clump
loci_ld_clump.tbl_df <- function(.x, .col = "genotypes", ...) {
stopifnot_gen_tibble(.x)
# check n_cores are available
.col <- rlang::enquo(.col) %>%
rlang::quo_get_expr() %>%
rlang::as_string()
# confirm that .col is "genotypes"
if (.col != "genotypes") {
stop("loci_ld_clump only works with the genotypes column")
}
loci_ld_clump(.x$genotypes, ...)
}
#' @export
#' @rdname loci_ld_clump
loci_ld_clump.vctrs_bigSNP <- function(
.x,
.col = "genotypes",
S = NULL, # nolint
thr_r2 = 0.2,
size = 100 / thr_r2,
exclude = NULL,
use_positions = TRUE,
n_cores = 1,
return_id = FALSE,
...) {
rlang::check_dots_empty()
stopifnot_diploid(.x)
if (n_cores > 1) {
# Remove checking for two levels of parallelism
options(bigstatsr.check.parallel.blas = FALSE)
on.exit(options(bigstatsr.check.parallel.blas = TRUE), add = TRUE)
}
# check that the loci have not been resorted
# check that big_index in the loci table is an increasing sequence of indeces
if (is.unsorted(show_loci(.x)$big_index, strictly = TRUE)) {
stop(paste(
"Your loci have been resorted; first save the new file backed",
"matrix with `gt_update_baking_file()"
))
}
if (gt_has_imputed(.x) && gt_uses_imputed(.x) == FALSE) {
# not uses_imputed
gt_set_imputed(.x, set = TRUE)
on.exit(gt_set_imputed(.x, set = FALSE), add = TRUE)
}
is_loci_table_ordered(.x, error_on_false = TRUE)
# get the FBM
geno_fbm <- attr(.x, "bigsnp")$genotypes # nolint
# rows (individuals) that we want to use
if (use_positions) {
.positions <- rep(NA, nrow(attr(.x, "bigsnp")$map))
.positions[show_loci(.x)$big_index] <- show_loci(.x)$position
} else {
.positions <- NULL
}
# create a chromosome vector (fill gaps between bigsnpr and show_loci)
.chromosome <- rep(2147483647L, nrow(attr(.x, "bigsnp")$map))
.chromosome[show_loci(.x)$big_index] <- show_loci(.x)$chr_int
# now figure out if we have any snp which have already been removed
# those will go into `exclude`
loci_not_in_tibble <-
seq_len(nrow(attr(.x, "bigsnp")$map))[
!seq_len(nrow(attr(.x, "bigsnp")$map)) %in% .gt_bigsnp_cols(.x)
] # nolint
exclude <- c(loci_not_in_tibble, .gt_bigsnp_cols(.x)[exclude])
if (length(exclude) == 0) {
exclude <- NULL
}
# as long as we have more than one individual
snp_clump_ids <- bigsnpr::snp_clumping(
G = attr(.x, "bigsnp")$genotypes,
# infos.chr = show_loci(.x)$chr_int, #nolint start
# TEMP HACK using the info from the bigsnpr object
# infos.chr = cast_chromosome_to_int(attr(.x,"bigsnp")$map$chromosome), #nolint end
infos.chr = .chromosome,
ind.row = vctrs::vec_data(.x),
S = S, # nolint
thr.r2 = thr_r2,
infos.pos = .positions,
size = size,
exclude = exclude,
ncores = n_cores
)
to_keep_id <- match(snp_clump_ids, show_loci(.x)$big_index)
if (return_id) {
to_keep_id
} else {
keep_bool <- rep(FALSE, count_loci(.x))
keep_bool[to_keep_id] <- TRUE
keep_bool
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.