bigsnpr: Analysis of Massive SNP Arrays

Documented in bed_autoSVD bed_randomSVD snp_autoSVD

################################################################################

# regroup consecutive integers in intervals
getIntervals <- function(x, n = 2) {

  le <- rle(diff(x))

  ind <- which((le$values == 1) & (le$lengths >= (n - 1)))
  pos <- cumsum(le$lengths) + 1

  cbind(x[c(1, pos)[ind]], x[pos[ind]])
}

################################################################################

#' Truncated SVD while limiting LD
#'
#' Fast truncated SVD with initial pruning and that iteratively removes
#' long-range LD regions. Some variants are removing due to the initial clumping,
#' then more and more variants are removed at each iteration. You can access the
#' indices of the remaining variants with `attr(*, "subset")`. If some of the
#' variants removed are contiguous, the regions are reported in `attr(*, "lrldr")`.
#'
#' If you don't have any information about SNPs, you can try using
#'   - `infos.chr = rep(1, ncol(G))`,
#'   - `size = ncol(G)` (if SNPs are not sorted),
#'   - `roll.size = 0` (if SNPs are not sorted).
#'
#' @inheritParams bigsnpr-package
#' @inheritParams snp_clumping
#' @param fun.scaling A function with parameters `X` (or `obj.bed`), `ind.row` and
#'   `ind.col`, and that returns a data.frame with `$center` and `$scale` for the
#'   columns corresponding to `ind.col`, to scale each of their elements such as followed:
#'   \deqn{\frac{X_{i,j} - center_j}{scale_j}.} Default uses binomial scaling.
#'   You can also provide your own `center` and `scale` by using [as_scaling_fun()].
#' @param k Number of singular vectors/values to compute. Default is `10`.
#'   **This algorithm should be used to compute a few singular vectors/values.**
#' @param roll.size Radius of rolling windows to smooth log-p-values.
#'   Default is `50`.
#' @param int.min.size Minimum number of consecutive outlier SNPs
#'   in order to be reported as long-range LD region. Default is `20`.
#' @param verbose Output some information on the iterations? Default is `TRUE`.
#' @param thr.r2 Threshold over the squared correlation between two SNPs.
#'   Default is `0.2`. Use `NA` if you want to skip the clumping step.
#' @param alpha.tukey Default is `0.1`. The type-I error rate in outlier
#'   detection (that is further corrected for multiple testing).
#' @param min.mac Minimum minor allele count (MAC) for variants to be included.
#'   Default is `10`.
#' @param max.iter Maximum number of iterations of outlier detection.
#'   Default is `5`.
#'
#' @inherit bigstatsr::big_randomSVD return
#' @export
#'
#' @examples
#' ex <- snp_attachExtdata()
#' G <- ex$genotypes
#'
#' obj.svd <- snp_autoSVD(G,
#'                        infos.chr = ex$map$chromosome,
#'                        infos.pos = ex$map$physical.position)
#'
#' str(obj.svd)
#'
snp_autoSVD <- function(G,
                        infos.chr,
                        infos.pos = NULL,
                        ind.row = rows_along(G),
                        ind.col = cols_along(G),
                        fun.scaling = snp_scaleBinom(),
                        thr.r2 = 0.2,
                        size = 100 / thr.r2,
                        k = 10,
                        roll.size = 50,
                        int.min.size = 20,
                        alpha.tukey = 0.05,
                        min.mac = 10,
                        max.iter = 5,
                        is.size.in.bp = NULL,
                        ncores = 1,
                        verbose = TRUE) {

  check_args()
  assert_lengths(infos.chr, cols_along(G))
  if (!is.null(infos.pos)) assert_lengths(infos.pos, cols_along(G))

  if (!missing(is.size.in.bp))
    warning2("Parameter 'is.size.in.bp' is deprecated.")

  # Verbose?
  printf2 <- function(...) if (verbose) printf(...)

  # First clumping
  if (is.na(thr.r2)) {
    printf2("\nSkipping clumping.\n")
    ind.keep <- ind.col
  } else {
    printf2("\nPhase of clumping (on MAF) at r^2 > %s.. ", thr.r2)
    ind.keep <- snp_clumping(G, infos.chr,
                             ind.row = ind.row,
                             exclude = setdiff(cols_along(G), ind.col),
                             thr.r2 = thr.r2,
                             size = size,
                             infos.pos = infos.pos,
                             ncores = ncores)
    printf2("keep %d SNPs.\n", length(ind.keep))
  }

  if (min.mac > 0) {
    maf <- snp_MAF(G, ind.row, ind.keep, ncores = ncores)
    min.maf <- min.mac / (2 * length(ind.row))
    mac.nok <- (maf < min.maf)
    printf2("Discarding %d variant%s with MAC < %s.\n", sum(mac.nok),
            `if`(sum(mac.nok) > 1, "s", ""), min.mac)
    ind.keep <- ind.keep[!mac.nok]
  }

  iter <- 0L
  LRLDR <- LD.wiki34[0, 1:3]
  repeat {
    printf2("\nIteration %d:\n", iter <- iter + 1L)
    printf2("Computing SVD..\n")
    # SVD
    obj.svd <- big_randomSVD(G,
                             fun.scaling = fun.scaling,
                             ind.row = ind.row,
                             ind.col = ind.keep,
                             k = k,
                             ncores = ncores)

    if (iter > max.iter) {
      printf2("Maximum number of iterations reached.\n")
      break
    }

    # check for outlier variants
    S.col <- sqrt(bigutilsr::dist_ogk(obj.svd$v))
    # roll mean to get only consecutive outliers (by chromosome)
    ind.split <- split(seq_along(S.col), infos.chr[ind.keep])
    S2.col <- double(length(S.col))
    for (ind in ind.split)
      S2.col[ind] <- bigutilsr::rollmean(S.col[ind], roll.size)
    S2.col.thr <- bigutilsr::tukey_mc_up(S2.col, alpha = alpha.tukey)
    ind.col.excl <- which(S2.col > S2.col.thr)
    printf2("%d outlier variant%s detected..\n", length(ind.col.excl),
            `if`(length(ind.col.excl) > 1, "s", ""))

    # Stop or continue?
    if (length(ind.col.excl) > 0) {

      ind.keep <- ind.keep[-ind.col.excl]

      # Detection of long-range LD regions
      if (!is.null(infos.pos)) {
        # Regroup them by intervals to return long-range LD regions
        ind.range <- getIntervals(ind.col.excl, n = int.min.size)
        printf2("%d long-range LD region%s detected..\n", nrow(ind.range),
                `if`(nrow(ind.range) > 1, "s", ""))
        for (i in rows_along(ind.range)) {
          seq.range <- seq2(ind.range[i, ])
          seq.range.chr <- infos.chr[ind.keep[seq.range]]
          chr <- stats::median(seq.range.chr)  ## to get mode
          in.chr <- (infos.chr[ind.keep[seq.range]] == chr)
          range.in.chr <- range(infos.pos[ind.keep[seq.range[in.chr]]])
          LRLDR[nrow(LRLDR) + 1L, ] <- c(chr, range.in.chr)
        }
      }
    } else {
      printf2("\nConverged!\n")
      break
    }
  }

  structure(
    obj.svd,
    subset = ind.keep,
    lrldr = LRLDR[with(LRLDR, order(Chr, Start, Stop)), ]
  )
}

################################################################################

#' Randomized partial SVD
#'
#' Partial SVD (or PCA) of a genotype matrix stored as a PLINK (.bed) file.#'
#'
#' @inheritParams bigsnpr-package
#' @inherit bigstatsr::big_randomSVD params return
#'
#' @export
#'
#' @examples
#' bedfile <- system.file("extdata", "example.bed", package = "bigsnpr")
#' obj.bed <- bed(bedfile)
#'
#' str(bed_randomSVD(obj.bed))
#'
bed_randomSVD <- function(
  obj.bed,
  fun.scaling = bed_scaleBinom,
  ind.row = rows_along(obj.bed),
  ind.col = cols_along(obj.bed),
  k = 10,
  tol = 1e-4,
  verbose = FALSE,
  ncores = 1
) {

  big_randomSVD(obj.bed$light, fun.scaling, ind.row, ind.col,
                k, tol, verbose, ncores,
                bed_prodVec, bed_cprodVec)
}

################################################################################

#' @rdname snp_autoSVD
#'
#' @export
bed_autoSVD <- function(obj.bed,
                        ind.row = rows_along(obj.bed),
                        ind.col = cols_along(obj.bed),
                        fun.scaling = bed_scaleBinom,
                        thr.r2 = 0.2,
                        size = 100 / thr.r2,
                        k = 10,
                        roll.size = 50,
                        int.min.size = 20,
                        alpha.tukey = 0.05,
                        min.mac = 10,
                        max.iter = 5,
                        ncores = 1,
                        verbose = TRUE) {

  infos.chr <- obj.bed$map$chromosome
  infos.pos <- obj.bed$map$physical.pos

  check_args()

  # Verbose?
  printf2 <- function(...) if (verbose) printf(...)

  # First clumping
  if (is.na(thr.r2)) {
    printf2("\nSkipping clumping.\n")
    ind.keep <- ind.col
  } else {
    printf2("\nPhase of clumping (on MAC) at r^2 > %s.. ", thr.r2)
    ind.keep <- bed_clumping(obj.bed,
                             ind.row = ind.row,
                             exclude = setdiff(cols_along(obj.bed), ind.col),
                             thr.r2 = thr.r2,
                             size = size,
                             ncores = ncores)
    printf2("keep %d variants.\n", length(ind.keep))
  }

  if (min.mac > 0) {
    mac <- bed_MAF(obj.bed, ind.row, ind.keep, ncores = ncores)$mac
    mac.nok <- (mac < min.mac)
    printf2("Discarding %d variant%s with MAC < %s.\n", sum(mac.nok),
            `if`(sum(mac.nok) > 1, "s", ""), min.mac)
    ind.keep <- ind.keep[!mac.nok]
  }

  iter <- 0L
  LRLDR <- LD.wiki34[0, 1:3]
  repeat {
    printf2("\nIteration %d:\n", iter <- iter + 1L)
    printf2("Computing SVD..\n")
    # SVD
    obj.svd <- bed_randomSVD(obj.bed,
                             fun.scaling = fun.scaling,
                             ind.row = ind.row,
                             ind.col = ind.keep,
                             k = k,
                             ncores = ncores)

    if (iter > max.iter) {
      printf2("Maximum number of iterations reached.\n")
      break
    }

    # check for outlier variants
    S.col <- sqrt(bigutilsr::dist_ogk(obj.svd$v))
    # roll mean to get only consecutive outliers (by chromosome)
    ind.split <- split(seq_along(S.col), infos.chr[ind.keep])
    S2.col <- double(length(S.col))
    for (ind in ind.split)
      S2.col[ind] <- bigutilsr::rollmean(S.col[ind], roll.size)
    S2.col.thr <- bigutilsr::tukey_mc_up(S2.col, alpha = alpha.tukey)
    ind.col.excl <- which(S2.col > S2.col.thr)
    printf2("%d outlier variant%s detected..\n", length(ind.col.excl),
            `if`(length(ind.col.excl) > 1, "s", ""))

    # Stop or continue?
    if (length(ind.col.excl) > 0) {

      ind.keep <- ind.keep[-ind.col.excl]

      # Detection of long-range LD regions
      if (!is.null(infos.pos)) {
        # Regroup them by intervals to return long-range LD regions
        ind.range <- getIntervals(ind.col.excl, n = int.min.size)
        printf2("%d long-range LD region%s detected..\n", nrow(ind.range),
                `if`(nrow(ind.range) > 1, "s", ""))
        for (i in rows_along(ind.range)) {
          seq.range <- seq2(ind.range[i, ])
          seq.range.chr <- infos.chr[ind.keep[seq.range]]
          chr <- stats::median(seq.range.chr)  ## to get mode
          in.chr <- (infos.chr[ind.keep[seq.range]] == chr)
          range.in.chr <- range(infos.pos[ind.keep[seq.range[in.chr]]])
          LRLDR[nrow(LRLDR) + 1L, ] <- c(chr, range.in.chr)
        }
      }
    } else {
      printf2("\nConverged!\n")
      break
    }
  }

  structure(
    obj.svd,
    subset = ind.keep,
    lrldr = LRLDR[with(LRLDR, order(Chr, Start, Stop)), ]
  )
}

################################################################################