R/pathwas_ld_clump.R

Defines functions random_string pathwas_ld_clump_local ld_clump_api pathwas_ld_clump

Documented in ld_clump_api pathwas_ld_clump pathwas_ld_clump_local

#' Perform LD clumping on SNP data
#'
#' Uses PLINK clumping method, where SNPs in LD within a particular window will be pruned.
#' The SNP with the lowest p-value is retained.
#'
#' @details
#' FOR PATHWAS PURPOSES:
#' All functions here are copied directly from the ieugwasr package (https://github.com/MRCIEU/ieugwasr).
#' They have been slightly edited to allow for definition of directory for storing temporary files.
#' All other functionality remains unchanged.
#'
#' ORIGINAL DESCRIPTION STARTS HERE:
#' This function interacts with the OpenGWAS API, which houses LD reference panels
#' for the 5 super-populations in the 1000 genomes reference panel.
#' It includes only bi-allelic SNPs with MAF > 0.01, so it's quite possible that
#' a variant you want to include in the clumping process will be absent.
#' If it is absent, it will be automatically excluded from the results.
#'
#' You can check if your variants are present in the LD reference panel using
#' [`ld_reflookup()`].
#'
#' This function does put load on the OpenGWAS servers, which makes life more
#' difficult for other users. We have implemented a method and made available
#' the LD reference panels to perform clumping locally, see
#' [`ld_clump()`] and related vignettes for details.
#'
#' @param dat Dataframe. Must have a variant name column (`rsid`) and pval column called `pval`.
#' If `id` is present then clumping will be done per unique id.
#' @param clump_kb Clumping kb window. Default is very strict, `10000`
#' @param clump_r2 Clumping r2 threshold. Default is very strict, `0.001`
#' @param clump_p Clumping sig level for index variants. Default = `1` (i.e. no threshold)
#' @param pop Super-population to use as reference panel. Default = `"EUR"`.
#' Options are `"EUR"`, `"SAS"`, `"EAS"`, `"AFR"`, `"AMR"`.
#' `'legacy'` also available - which is a previously used verison of the EUR
#' panel with a slightly different set of markers
#' @param access_token Google OAuth2 access token. Used to authenticate level of access to data
#' @param bfile If this is provided then will use the API. Default = `NULL`
#' @param plink_bin If `NULL` and `bfile` is not `NULL` then will detect
#' packaged plink binary for specific OS. Otherwise specify path to plink binary.
#' Default = `NULL`
#'
#' @export
#' @return Data frame
pathwas_ld_clump <- function(dat=NULL, clump_kb=10000, clump_r2=0.001, clump_p=0.99,
                     pop = "EUR", access_token=NULL, bfile=NULL, plink_bin=NULL, def_tmpDir=tempdir())
{

  stopifnot("rsid" %in% names(dat))
  stopifnot(is.data.frame(dat))

  if(is.null(bfile))
  {
    message("Please look at vignettes for options on running this locally if you need to run many instances of this command.")
  }

  if(! "pval" %in% names(dat))
  {
    if( "p" %in% names(dat))
    {
      warning("No 'pval' column found in dat object. Using 'p' column.")
      dat[["pval"]] <- dat[["p"]]
    } else {
      warning("No 'pval' column found in dat object. Setting p-values for all SNPs to clump_p parameter.")
      dat[["pval"]] <- clump_p
    }
  }

  if(! "id" %in% names(dat))
  {
    dat$id <- random_string(1)
  }

  if(is.null(bfile))
  {
    access_token = check_access_token()
  }

  ids <- unique(dat[["id"]])
  res <- list()
  for(i in 1:length(ids))
  {
    x <- subset(dat, dat[["id"]] == ids[i])
    if(nrow(x) == 1)
    {
      message("Only one SNP for ", ids[i])
      res[[i]] <- x
    } else {
      message("Clumping ", ids[i], ", ", nrow(x), " variants, using ", pop, " population reference")
      if(is.null(bfile))
      {
        res[[i]] <- ld_clump_api(x, clump_kb=clump_kb, clump_r2=clump_r2, clump_p=clump_p, pop=pop, access_token=access_token)
      } else {
        res[[i]] <- pathwas_ld_clump_local(x, clump_kb=clump_kb, clump_r2=clump_r2, clump_p=clump_p, bfile=bfile, plink_bin=plink_bin, def_tmpDir=def_tmpDir)
      }
    }
  }
  res <- dplyr::bind_rows(res)
  return(res)
}


#' Perform clumping on the chosen variants using through API
#'
#' @param dat Dataframe. Must have a variant name column (`variant`) and pval column called `pval`.
#' If `id` is present then clumping will be done per unique id.
#' @param clump_kb Clumping kb window. Default is very strict, `10000`
#' @param clump_r2 Clumping r2 threshold. Default is very strict, `0.001`
#' @param clump_p Clumping sig level for index variants. Default = `1` (i.e. no threshold)
#' @param pop Super-population to use as reference panel. Default = `"EUR"`.
#' Options are `"EUR"`, `"SAS"`, `"EAS"`, `"AFR"`, `"AMR"`
#' @param access_token Google OAuth2 access token. Used to authenticate level of access to data
#' @return Data frame of only independent variants
ld_clump_api <- function(dat, clump_kb=10000, clump_r2=0.1, clump_p, pop="EUR", access_token=check_access_token())
{
  res <- api_query('ld/clump',
                   query = list(
                     rsid = dat[["rsid"]],
                     pval = dat[["pval"]],
                     pthresh = clump_p,
                     r2 = clump_r2,
                     kb = clump_kb,
                     pop = pop
                   ),
                   access_token=access_token
  ) %>% get_query_content()
  y <- subset(dat, !dat[["rsid"]] %in% res)
  if(nrow(y) > 0)
  {
    message("Removing ", length(y[["rsid"]]), " of ", nrow(dat), " variants due to LD with other variants or absence from LD reference panel")
  }
  return(subset(dat, dat[["rsid"]] %in% res))
}


#' Wrapper for clump function using local plink binary and ld reference dataset
#'
#' @param dat Dataframe. Must have a variant name column (`variant`) and pval column called `pval`.
#' If `id` is present then clumping will be done per unique id.
#' @param clump_kb Clumping kb window. Default is very strict, `10000`
#' @param clump_r2 Clumping r2 threshold. Default is very strict, `0.001`
#' @param clump_p Clumping sig level for index variants. Default = `1` (i.e. no threshold)
#' @param bfile If this is provided then will use the API. Default = `NULL`
#' @param plink_bin Specify path to plink binary. Default = `NULL`.
#' See \url{https://github.com/explodecomputer/plinkbinr} for convenient access to plink binaries
#' @importFrom utils read.table
#' @importFrom utils write.table
#' @export
#' @return data frame of clumped variants
pathwas_ld_clump_local <- function(dat, clump_kb, clump_r2, clump_p, bfile, plink_bin, def_tmpDir)
{

  # Make textfile
  shell <- ifelse(Sys.info()['sysname'] == "Windows", "cmd", "sh")
  fn <- tempfile(tmpdir = def_tmpDir)
  write.table(data.frame(SNP=dat[["rsid"]], P=dat[["pval"]]), file=fn, row.names=F, col.names=T, quote=F)

  fun2 <- paste0(
    shQuote(plink_bin, type=shell),
    " --bfile ", shQuote(bfile, type=shell),
    " --clump ", shQuote(fn, type=shell),
    " --clump-p1 ", clump_p,
    " --clump-r2 ", clump_r2,
    " --clump-kb ", clump_kb,
    " --out ", shQuote(fn, type=shell)
  )
  system(fun2)
  res <- read.table(paste(fn, ".clumped", sep=""), header=T)
  unlink(paste(fn, "*", sep=""))
  y <- subset(dat, !dat[["rsid"]] %in% res[["SNP"]])
  if(nrow(y) > 0)
  {
    message("Removing ", length(y[["rsid"]]), " of ", nrow(dat), " variants due to LD with other variants or absence from LD reference panel")
  }
  return(subset(dat, dat[["rsid"]] %in% res[["SNP"]]))
}

random_string <- function(n=1, len=6)
{
  randomString <- c(1:n)
  for (i in 1:n)
  {
    randomString[i] <- paste(sample(c(0:9, letters, LETTERS),
                                    len, replace=TRUE),
                             collapse="")
  }
  return(randomString)
}
Sabor117/PathWAS documentation built on Nov. 29, 2024, 7:44 a.m.