R/extract_variants.R

Defines functions extract_variants

Documented in extract_variants

#' @title Targeted extraction of variants from .vcf files using Tabix files
#'
#' @description Performs targeted extraction of variants from chromosome .vcf files using Tabix index (.tbi) files.
#'
#' @param chr_files Filenames (in order) for the chromosome-wise .vcf files
#' @param gwas_info An object generated by get_trait_variants() or get_pQTLs(). Alternatively, a character vector of rsids.
#' @param autosomes If TRUE, only extracts variants from the autosomes.
#' @param range Variants are extracted from the start position until start position + range (in bp).
#' @param grch37 If TRUE, the build is grch37; if FALSE, grch38 is used.
#' @param keep_all If TRUE, keeps all variants in the search range, even those not in gwas_info.
#'
#' @return A list with two data.frames: 'gt', containing the genotype data in number of risk alleles (0/1/2), and 'fix',
#' containing information on the variants: rsid, chromosome, position, reference and alternative allele,
#' and imputation quality information (e.g. DR2).
#' @examples
#' \donttest{TG_gwas_info <- get_trait_variants('thrombin generation',trait_list=NULL,ask_about_efo=FALSE)}
#' # TG_variants <- extract_variants(chromosome_files, TG_gwas_info)
#' @export
#' @importFrom biomaRt "useMart" "getBM"
#' @importFrom Rsamtools "TabixFile" "seqnamesTabix"
#' @importFrom SummarizedExperiment "rowRanges"
#' @importFrom IRanges "IRanges"
#' @importFrom GenomicRanges "GRanges"
#' @importFrom VariantAnnotation "readVcf" "info"

extract_variants <- function(chr_files, # chromosome vcf files
                             gwas_info, # output from get_trait_variants/get_pQTLs or a vector of rsids
                             autosomes=TRUE, # will only consider autosomes when TRUE
                             range=0, # search for variants from start position+range (in bp)
                             grch37=TRUE, # if FALSE, the build is grch38
                             keep_all=FALSE)# if TRUE, keeps all variants in the search range
{
  start_time <- Sys.time()
  files <- chr_files
  g <- gwas_info
  if(class(g)=='character'){
    g <- data.frame(variant_id=g)
  }
  if(grch37==TRUE){
    cat('> Using the grch37 build to retrieve positions.\n')
    usemart <- biomaRt::useMart(biomart = "ENSEMBL_MART_SNP", host = "grch37.ensembl.org",
                        dataset = "hsapiens_snp")
    posdf <- biomaRt::getBM(attributes = c("refsnp_id", "chr_name", "chrom_start"),
                      filters = c("snp_filter"),
                      values = list(snp_filter = g$variant_id), mart = usemart)
  }
  else{
    cat('> Using the grch38 build to retrieve positions.\n')
    usemart <- biomaRt::useMart(biomart = "ENSEMBL_MART_SNP",
                        dataset = "hsapiens_snp")
    posdf <- biomaRt::getBM(attributes = c("refsnp_id", "chr_name", "chrom_start"),
                      filters = c("snp_filter"),
                      values = list(snp_filter = g$variant_id), mart = usemart)
  }
  valid_chroms <- seq(1,22)
  if(autosomes==FALSE){
    valid_chroms <- c(valid_chroms,'X','Y')
  }
  posdf <- posdf[posdf$chr_name%in%valid_chroms,]
  if(nrow(posdf)==0){
    stop('No variants found in the specified chromosomes. Try autosomes=FALSE.\n')
  }
  if(autosomes==FALSE){
    posdf$chr_name[which(posdf$chr_name=='X')] <- 23
    posdf$chr_name[which(posdf$chr_name=='Y')] <- 24
  }
  posdf <- posdf[order(as.numeric(posdf$chr_name),decreasing=FALSE),]
  chr_list <- chr_list2 <- list()
  for(chrom in 1:length(unique(posdf$chr_name))){
    chr <- unique(posdf$chr_name)[chrom]
    chr2 <- chr
    tabix <- Rsamtools::TabixFile(files[as.numeric(chr)])
    if(chr%in%c('23','24')){
      chr2 <- c(seq(1,22),c('X','Y'))[as.numeric(chr)]
    }
    tabix_seq <- Rsamtools::seqnamesTabix(tabix)
    if(tabix_seq!=chr2){
      warning('Tabix seqnames do not match the chromosome name (',chr2,').')
    }
    cat('> Extracting variants in chromosome',chr2,'\n')
    d <- posdf[posdf$chr_name==chr,]
    res <- lapply(1:nrow(d),function(s){
      cat('** Searching for variant',s,'of',nrow(d),'\n')
      rng <- GenomicRanges::GRanges(seqnames=tabix_seq,
                     ranges=IRanges::IRanges(d$chrom_start[s],d$chrom_start[s]+range))
      ext <- VariantAnnotation::readVcf(tabix, genome=paste0('chr',chr2), param=rng)
      fix <- data.frame(SummarizedExperiment::rowRanges(ext))
      info <- VariantAnnotation::info(ext)
      info <- cbind(fix,info)
      ext <- VariantAnnotation::genotypeToSnpMatrix(ext)
      ext <- as(ext$genotype,'numeric')
      list(ext=ext, info=info)
    })
    chr_list[[chrom]] <- do.call('cbind',lapply(res,function(x) x$ext))
    chr_list2[[chrom]] <- do.call('rbind',lapply(res,function(x) x$info))
  }
  out_df <- data.frame(do.call('cbind',chr_list))
  out_df2 <- data.frame(do.call('rbind',chr_list2))
  out_df2[,3:6] <- NULL
  out_df2$ID <- rownames(out_df2)
  colnames(out_df2)[1:2] <- c('CHROM','POS')
  out_df2 <- out_df2[,c(ncol(out_df2),1:(ncol(out_df2)-1))]
  out_df2$ALT <- unlist(lapply(out_df2$ALT, function(x) as.character(x[[1]])))
  found <- sum(unique(g$variant_id)%in%colnames(out_df))
  if(keep_all==FALSE){
    out_df <- out_df[,which(colnames(out_df)%in%g$variant_id)]
    out_df2 <- out_df2[which(rownames(out_df2)%in%g$variant_id),]
  }
  cat('> Succesfully retrieved',found,'of',length(unique(g$variant_id)),'variants.\n')
  end_time <- Sys.time()
  cat('> Extraction of the variants took',difftime(end_time,start_time,units='secs'),'seconds.\n')
  return(list(gt=out_df,
              fix=out_df2))
}
vincent10kd/polygenic documentation built on Feb. 25, 2024, 10:17 a.m.