R/prs_pipeline.R

Defines functions prs_pipeline

Documented in prs_pipeline

#' @title User-friendly pipeline to retrieve variants and convert them into a polygenic risk score
#'
#' @description A pipeline creating a polygenic risk score from start to finish.
#'
#' @param trait Expects a regular expression to find the trait of interest
#' @param trait_list An object generated by init_gwas_db(). If NULL, init_gwas_db() will be run internally.
#' @param pQTL An object generated by get_pQTLs(). Only one of trait or pQTL should be provided.
#' @param is_cis If TRUE, only includes cis pQTLs in the output.
#' @param chr_files Filenames (in order) for the chromosome-wise .vcf files
#' @param autosomes If TRUE, only extracts variants from the autosomes.
#' @param range Variants are extracted from the start position until start position + range (in bp).
#' @param grch37 If TRUE, the build is grch37; if FALSE, grch38 is used.
#' @param keep_all If TRUE, keeps all variants in the search range, even those not in gwas_info.
#' @param binary_outcome Set to TRUE for binary traits, and FALSE for continuous outcomes (including pQTLs).
#' @param ... Any other argument to pass to create_prs().
#'
#' @return A list containing several data.frames with all relevant information. The risk score is stored in element 'prs'.
#' @examples
#' # vte_prs <- prs_pipeline(trait='venous thromboembolism|deep vein thrombosis|pulmonary embolism',
#' # trait_list=trait_list,
#' # chr_files=vcf_files_myo2,
#' # autosomes=FALSE)
#' @export

prs_pipeline <- function(trait=NULL,
                         trait_list=NULL,
                         pQTL=NULL,
                         is_cis=TRUE, # if FALSE, returns also trans pQTLs (>1Mb from TSS)
                         chr_files=NULL, # provide vector of filenames (including folder)
                         autosomes=TRUE, # if FALSE, sex chromosomes are also searched
                         range=0, # search for variants from start position+range (in bp)
                         grch37=TRUE, # if FALSE, the build is grch38
                         keep_all=FALSE, # if FALSE, only those rsids provided are kept
                         binary_outcome=TRUE,
                         ...){
  starttime <- Sys.time()
  cat('> Retrieving information for trait/pQTL of interest.\n')
  if(!is.null(trait)&!is.null(pQTL)){stop('Trait and pQTL cannot both be non-NULL.')}
  if(!is.null(trait)){
    g <- get_trait_variants(trait, trait_list=trait_list)
  }
  if(!is.null(pQTL)){
    g <- get_pQTLs(pQTL, is_cis=is_cis)
    binary_outcome <- FALSE
  }
  cat('> Initiating extraction of variants from chromosome files.\n')
  if(is.null(chr_files)){stop('Filenames for chromosome .vcf files should be provided as chr_files.')}
  v <- extract_variants(chr_files=chr_files,
                        gwas_info=g,
                        autosomes=autosomes,
                        range=range,
                        grch37=grch37,
                        keep_all=keep_all)
  cat('> Constructing polygenic risk score from variants.\n')
  prs <- create_prs(variant_data=v,
                    gwas_info=g,
                    binary_outcome=binary_outcome,
                    ...)
  endtime <- Sys.time()
  cat('> Running the full pipeline took',difftime(endtime,starttime,units='secs'),'seconds.')
  return(prs)
}
vincent10kd/polygenic documentation built on Feb. 25, 2024, 10:17 a.m.