################################################################################
#' @title Imputation, quality control and subset for a "bigSNP".
#' @name impute-qc-sub
#' @inheritParams bigsnpr-package
#' @return A new `bigSNP`.
#' @seealso [bigSNP][bigSNP-class]
#' @example examples/example.sub.bigSNP.R
NULL
################################################################################
#' @description `sub.bigSNP`: a function
#' to get a subset of an object of class `bigSNP`.
#' @param ind.row Indices of the rows (individuals) to keep.
#' Negative indices can be used to exclude row indices.
#' Default: keep them all.
#' @param ind.col Indices of the columns (SNPs) to keep.
#' Negative indices can be used to exclude column indices.
#' Default: keep them all.
#' @param backed Should the new `bigSNP` be filebacked? Default is `TRUE`.
#' @param shared Should the new genotype matrix be shared? Default is `TRUE`.
#' @export
#' @name sub.bigSNP
#' @rdname impute-qc-sub
sub.bigSNP <- function(x, ind.row = seq(nrow(x$genotypes)),
ind.col = seq(ncol(x$genotypes)),
backed = TRUE, shared = TRUE) {
check_x(x)
if (backed) {
newfile <- checkFile(x, "sub")
X2 <- deepcopy(x$genotypes,
rows = ind.row,
cols = ind.col,
# type = "char",
backingfile = paste0(newfile, ".bk"),
backingpath = x$backingpath,
descriptorfile = paste0(newfile, ".desc"))
snp_list <- list(genotypes = X2,
fam = x$fam[ind.row, ],
map = x$map[ind.col, ],
backingfile = newfile,
backingpath = x$backingpath)
class(snp_list) <- "bigSNP"
saveRDS(snp_list, file.path(x$backingpath, paste0(newfile, ".rds")))
} else {
X2 <- deepcopy(x$genotypes,
rows = ind.row,
cols = ind.col,
# type = "char",
shared = shared)
snp_list <- list(genotypes = X2,
fam = x$fam[ind.row, ],
map = x$map[ind.col, ],
backingfile = NULL,
backingpath = NULL)
class(snp_list) <- "bigSNP"
}
return(snp_list)
}
################################################################################
#' @name QC
#' @description `QC`: Quality control (filters)
#' for a `bigSNP` resulting in a `bigSNP` of lower dimension.
#' @param hwe.pval Level threshold (allowed type-I error) to test deviations
#' from Hardy–Weinberg equilibrium (HWE) from controls only. Default is `1e-6`.
#' @param rm.sex Keep only SNPs on the first 22 chrosmosomes? Default is `FALSE`.
#' @param row.cr.min Minimun individuals' call rate that is allowed.
#' Default is 95\%.
#' @param col.cr.min Minimum SNPs' call rate that is allowed. Default is 95\%.
#' @param maf.min Minimum Minor Allele Frequency that is allowed.
#' Usually, `0.01` is used. Default only removes SNPs that have a zero MAF.
#' @rdname impute-qc-sub
#' @export
QC <- function(x, row.cr.min = 0.95,
col.cr.min = 0.95,
hwe.pval = 1e-6,
maf.min = NULL,
rm.sex = FALSE) {
check_x(x)
counts <- snp_counts(x)
### HWE
hwe.qc <- function(observed) {
n <- colSums(observed)
q <- (observed[1, ] + observed[2,] / 2) / n
p <- 1 - q
expected <- n * rbind(q^2, 2*p*q, p^2)
#X2 <- colSums((abs(observed - expected) - 0.5)^2 / expected)
X2 <- colSums((observed - expected)^2 / expected)
pX2 <- stats::pchisq(X2, 1, lower.tail = FALSE)
return(which(pX2 < hwe.pval))
}
ind.hwe.qc <- hwe.qc(counts$cols.controls) # only controls
### MAF
# controls + cases
observed <- counts$cols.controls + counts$cols.cases
n <- colSums(observed)
q <- (observed[1, ] + observed[2,] / 2) / n
maf <- pmin(q, 1 - q)
ind.maf.qc <- which(maf < maf.min | maf == 0)
### NA COL
n.all <- nrow(x$genotypes)
call.rate.col <- n / n.all
ind.cr.col.qc <- which(call.rate.col < col.cr.min)
### NOT AUTOSOMAL
if (rm.sex) {
ind.sex <- which(x$map$chromosome > 22)
} else {
ind.sex <- integer(0)
}
### NA ROW
m.all <- ncol(x$genotypes)
call.rate.row <- 1 - counts$rows / m.all
ind.cr.row.qc <- which(call.rate.row < row.cr.min)
### Regroup everything
ind.qc.col <- c(ind.hwe.qc, ind.maf.qc, ind.cr.col.qc, ind.sex)
ind.qc.row <- c(ind.cr.row.qc)
return(sub.bigSNP(x,
ind.row = `if`(length(ind.qc.row) > 0,
-ind.qc.row, seq(n.all)),
ind.col = `if`(length(ind.qc.col) > 0,
-ind.qc.col, seq(m.all))))
}
################################################################################
#'@description `Impute`: Imputation function
#'for a `bigSNP`.
#'@param verbose Print progress? Default is `FALSE`.
#'@export
#'@name Impute
#'@rdname impute-qc-sub
Impute <- function(x, ncores = 1, verbose = FALSE) {
check_x(x)
# get descriptors
X.desc <- describe(x$genotypes)
newfile <- checkFile(x, "impute")
X2 <- deepcopy(x$genotypes, type = "char",
backingfile = paste0(newfile, ".bk"),
backingpath = x$backingpath,
descriptorfile = paste0(newfile, ".desc"))
X2.desc <- describe(X2)
range.chr <- LimsChr(x)
# function that imputes one chromosome
if (is.seq <- (ncores == 1)) {
registerDoSEQ()
} else {
cl <- parallel::makeCluster(ncores, outfile = `if`(verbose, "", NULL))
doParallel::registerDoParallel(cl)
}
res <- foreach(ic = seq_len(nrow(range.chr))) %dopar% {
lims <- range.chr[ic, ]
if (verbose)
printf("Imputing chromosome %d with \"nearest neighbors\"...\n", lims[3])
X <- sub.big.matrix(X.desc, firstCol = lims[1], lastCol = lims[2],
backingpath = x$backingpath)
X2 <- sub.big.matrix(X2.desc, firstCol = lims[1], lastCol = lims[2],
backingpath = x$backingpath)
predictNA <- function(ind, ind2) {
tmp <- X[, ind]
indNA <- which(is.na(tmp[, ind2]))
if (length(indNA) > 0) {
for (i in indNA) {
tmpSum <- rowSums(sweep(tmp[, -ind2], 2, tmp[i, -ind2], '=='), na.rm = T)
k <- 6
cond <- T
while (cond) {
indNN <- which(tmpSum == k)
k <- k - 1
pred <- mean(tmp[indNN, ind2], na.rm = T)
cond <- is.na(pred)
}
X2[i, ind[ind2]] <- round(pred)
}
}
return(0)
}
m <- ncol(X)
opt.save <- options(bigmemory.typecast.warning = FALSE)
# first three columns
for (j in 1:3) {
predictNA(1:7, j)
}
# middle
for (j in 4:(m-3)) {
predictNA(j + -3:3, 4)
}
# last three columns
for (j in 5:7) {
predictNA(m + -6:0, j)
}
on.exit(options(opt.save), add = TRUE)
if (verbose)
printf("Done imputing chromosome %d.\n", lims[3])
return(0)
}
if (!is.seq) parallel::stopCluster(cl)
snp_list <- list(genotypes = X2,
fam = x$fam,
map = x$map,
backingfile = newfile,
backingpath = x$backingpath)
class(snp_list) <- "bigSNP"
saveRDS(snp_list, file.path(x$backingpath, paste0(newfile, ".rds")))
return(snp_list)
}
################################################################################
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.