R/regenie.R

Defines functions plot_manhattan_regenie read_regenie

Documented in plot_manhattan_regenie read_regenie

EXPECTED_REGENIE_COLNAMES <- c(
  'CHROM',
  'GENPOS',
  'ID',
  'ALLELE0',
  'ALLELE1',
  'A1FREQ',
  'INFO',
  'N',
  'TEST',
  'BETA',
  'SE',
  'CHISQ',
  'LOG10P',
  'EXTRA'
)

# Manhattan plots ---------------------------------------------------------

#' Read REGENIE results files into R
#'
#' Reads \code{.regenie} GWAS results files into a data table in R, filtering on
#' minor allele frequency (appended) and info score. Also appends a p value
#' column (\code{P}).
#'
#' @param path File path to directory containing \code{.regenie} files. There
#'   should be one file for each of 22 chromosomes (i.e. excluding sex
#'   chromosomes), otherwise and error is raised. An error is also raised if the
#'   column names do not match those expected for REGENIE: 'CHROM', 'GENPOS',
#'   'ID', 'ALLELE0', 'ALLELE1', 'A1FREQ', 'INFO', 'N', 'TEST', 'BETA', 'SE',
#'   'CHISQ', 'LOG10P', 'EXTRA'.
#' @param maf_threshold Minor allele frequencies below this threshold will be
#'   removed. Default is 0.01.
#' @param info_threshold Info scores below this threshold will be removed.
#'   Default is 0.9.
#'
#' @return A data table.
#' @export
read_regenie <- function(path,
                         maf_threshold = 0.01,
                         info_threshold = 0.9) {
  start_time <- proc.time()

  # get regenie results file paths
  regenie_filepaths <- fs::dir_ls(path = path,
                                  glob = "*.regenie")

  # check that .regenie files for all 22 chromosomes are present (excludes sex chromosomes)
  assertthat::assert_that(
    length(regenie_filepaths) == 22,
    msg = paste0(
      "Error! 22 files expected, but only ",
      length(regenie_filepaths),
      " .regenie files detected at `path`"
    )
  )

  # read into R
  message("Reading regenie files into R")
  regenie_results <- regenie_filepaths %>%
    purrr::map(data.table::fread) %>%
    dplyr::bind_rows()

  # check that expected column names are present
  assertthat::assert_that(
    all(names(regenie_results) == EXPECTED_REGENIE_COLNAMES),
    msg = paste0(
      "Error! Unexpected column names. Expected: ",
      stringr::str_c(
        EXPECTED_REGENIE_COLNAMES,
        sep = "",
        collapse = ", "
      ),
      ". Got: ",
      stringr::str_c(names(regenie_results),
                     sep = "",
                     collapse = ", ")
    )
  )

  message("Appending P and MAF cols")
  # append p and maf columns
  regenie_results <- regenie_results %>%
    dplyr::mutate("P" = 1 / (10 ^ .data[["LOG10P"]]),
                  "MAF" = ifelse(.data[["A1FREQ"]] > 0.5,
                                 yes = 1 - .data[["A1FREQ"]],
                                 no = .data[["A1FREQ"]]))

  # filter on maf and info
  message("Filtering on MAF and INFO")
  regenie_results <- regenie_results %>%
    dplyr::filter(.data[["MAF"]] > !!maf_threshold &
                    .data[["INFO"]] > !!info_threshold) %>%

    # select/rename required cols (match defaults for `qqman::manhattan()`)
    dplyr::select(CHR = CHROM,
                  BP = GENPOS,
                  P,
                  SNP = ID,
                  MAF,
                  INFO)

  # return result
  message("Success!")
  time_taken_message(start_time)

  return(regenie_results)
}

# Plots -------------------------------------------------------------------

#' Generate a Manhattan plot
#'
#' A thin wrapper around \code{\link[qqman]{manhattan}} with preferred default
#' settings.
#'
#' @inheritParams qqman::manhattan
#'
#' @return A Manhattan plot
#' @export
plot_manhattan_regenie <- function(x,
                                   chr = "CHR",
                                   bp = "BP",
                                   p = "P",
                                   snp = "SNP",
                                   col = c("blue4", "orange3"),
                                   chrlabs = NULL,
                                   suggestiveline = FALSE,
                                   genomewideline = -log10(5e-08),
                                   highlight = NULL,
                                   logp = TRUE,
                                   annotatePval = NULL,
                                   annotateTop = TRUE,
                                   cex = 0.4,
                                   cex.axis = 0.6,
                                   ...) {
  qqman::manhattan(x,
                   chr = chr,
                   bp = bp,
                   p = p,
                   snp = snp,
                   col = col,
                   chrlabs = chrlabs,
                   suggestiveline = suggestiveline,
                   genomewideline = genomewideline,
                   highlight = highlight,
                   logp = logp,
                   annotatePval = annotatePval,
                   annotateTop = annotateTop,
                   cex = cex,
                   cex.axis = cex.axis,
                   ...)
}
rmgpanw/rlgenetics documentation built on Jan. 6, 2022, 12:32 a.m.