#' @title Targeted extraction of variants from .vcf files using Tabix files
#'
#' @description Performs targeted extraction of variants from chromosome .vcf files using Tabix index (.tbi) files.
#'
#' @param chr_files Filenames (in order) for the chromosome-wise .vcf files
#' @param gwas_info An object generated by get_trait_variants() or get_pQTLs(). Alternatively, a character vector of rsids.
#' @param autosomes If TRUE, only extracts variants from the autosomes.
#' @param range Variants are extracted from the start position until start position + range (in bp).
#' @param grch37 If TRUE, the build is grch37; if FALSE, grch38 is used.
#' @param keep_all If TRUE, keeps all variants in the search range, even those not in gwas_info.
#'
#' @return A list with two data.frames: 'gt', containing the genotype data in number of risk alleles (0/1/2), and 'fix',
#' containing information on the variants: rsid, chromosome, position, reference and alternative allele,
#' and imputation quality information (e.g. DR2).
#' @examples
#' \donttest{TG_gwas_info <- get_trait_variants('thrombin generation',trait_list=NULL,ask_about_efo=FALSE)}
#' # TG_variants <- extract_variants(chromosome_files, TG_gwas_info)
#' @export
#' @importFrom biomaRt "useMart" "getBM"
#' @importFrom Rsamtools "TabixFile" "seqnamesTabix"
#' @importFrom SummarizedExperiment "rowRanges"
#' @importFrom IRanges "IRanges"
#' @importFrom GenomicRanges "GRanges"
#' @importFrom VariantAnnotation "readVcf" "info"
extract_variants <- function(chr_files, # chromosome vcf files
gwas_info, # output from get_trait_variants/get_pQTLs or a vector of rsids
autosomes=TRUE, # will only consider autosomes when TRUE
range=0, # search for variants from start position+range (in bp)
grch37=TRUE, # if FALSE, the build is grch38
keep_all=FALSE)# if TRUE, keeps all variants in the search range
{
start_time <- Sys.time()
files <- chr_files
g <- gwas_info
if(class(g)=='character'){
g <- data.frame(variant_id=g)
}
if(grch37==TRUE){
cat('> Using the grch37 build to retrieve positions.\n')
usemart <- biomaRt::useMart(biomart = "ENSEMBL_MART_SNP", host = "grch37.ensembl.org",
dataset = "hsapiens_snp")
posdf <- biomaRt::getBM(attributes = c("refsnp_id", "chr_name", "chrom_start"),
filters = c("snp_filter"),
values = list(snp_filter = g$variant_id), mart = usemart)
}
else{
cat('> Using the grch38 build to retrieve positions.\n')
usemart <- biomaRt::useMart(biomart = "ENSEMBL_MART_SNP",
dataset = "hsapiens_snp")
posdf <- biomaRt::getBM(attributes = c("refsnp_id", "chr_name", "chrom_start"),
filters = c("snp_filter"),
values = list(snp_filter = g$variant_id), mart = usemart)
}
valid_chroms <- seq(1,22)
if(autosomes==FALSE){
valid_chroms <- c(valid_chroms,'X','Y')
}
posdf <- posdf[posdf$chr_name%in%valid_chroms,]
if(nrow(posdf)==0){
stop('No variants found in the specified chromosomes. Try autosomes=FALSE.\n')
}
if(autosomes==FALSE){
posdf$chr_name[which(posdf$chr_name=='X')] <- 23
posdf$chr_name[which(posdf$chr_name=='Y')] <- 24
}
posdf <- posdf[order(as.numeric(posdf$chr_name),decreasing=FALSE),]
chr_list <- chr_list2 <- list()
for(chrom in 1:length(unique(posdf$chr_name))){
chr <- unique(posdf$chr_name)[chrom]
chr2 <- chr
tabix <- Rsamtools::TabixFile(files[as.numeric(chr)])
if(chr%in%c('23','24')){
chr2 <- c(seq(1,22),c('X','Y'))[as.numeric(chr)]
}
tabix_seq <- Rsamtools::seqnamesTabix(tabix)
if(tabix_seq!=chr2){
warning('Tabix seqnames do not match the chromosome name (',chr2,').')
}
cat('> Extracting variants in chromosome',chr2,'\n')
d <- posdf[posdf$chr_name==chr,]
res <- lapply(1:nrow(d),function(s){
cat('** Searching for variant',s,'of',nrow(d),'\n')
rng <- GenomicRanges::GRanges(seqnames=tabix_seq,
ranges=IRanges::IRanges(d$chrom_start[s],d$chrom_start[s]+range))
ext <- VariantAnnotation::readVcf(tabix, genome=paste0('chr',chr2), param=rng)
fix <- data.frame(SummarizedExperiment::rowRanges(ext))
info <- VariantAnnotation::info(ext)
info <- cbind(fix,info)
ext <- VariantAnnotation::genotypeToSnpMatrix(ext)
ext <- as(ext$genotype,'numeric')
list(ext=ext, info=info)
})
chr_list[[chrom]] <- do.call('cbind',lapply(res,function(x) x$ext))
chr_list2[[chrom]] <- do.call('rbind',lapply(res,function(x) x$info))
}
out_df <- data.frame(do.call('cbind',chr_list))
out_df2 <- data.frame(do.call('rbind',chr_list2))
out_df2[,3:6] <- NULL
out_df2$ID <- rownames(out_df2)
colnames(out_df2)[1:2] <- c('CHROM','POS')
out_df2 <- out_df2[,c(ncol(out_df2),1:(ncol(out_df2)-1))]
out_df2$ALT <- unlist(lapply(out_df2$ALT, function(x) as.character(x[[1]])))
found <- sum(unique(g$variant_id)%in%colnames(out_df))
if(keep_all==FALSE){
out_df <- out_df[,which(colnames(out_df)%in%g$variant_id)]
out_df2 <- out_df2[which(rownames(out_df2)%in%g$variant_id),]
}
cat('> Succesfully retrieved',found,'of',length(unique(g$variant_id)),'variants.\n')
end_time <- Sys.time()
cat('> Extraction of the variants took',difftime(end_time,start_time,units='secs'),'seconds.\n')
return(list(gt=out_df,
fix=out_df2))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.