R/loadVCF.R

#' loadVcfFileForGene
#'
#' Function to load vcf file for a particular gene and transcrit, given a transcript object as
#' returned by VEP EPI. Requires: VariantAnnotation, BiocGenerics
#' @param vcfFile File name to load
#' @param transcript VEP transcript object or list. Needs to contain 'id', 'seq_region_name','start' and 'end' elements
#' @param includeFilteredSites Return filtered-out sites (i.e. sites marked with anything other than PASS on vcf). Defaults to FALSE
#' @param n Max number of records to load. Set to -1 for no limit.
#' @param refVersion Reference version. Defaults to "GRCh37".
#' @param returnGenotypes Return genotype data. Default to FALSE.
#' @keywords cats
#' @export
#' @examples
#' exacdata<- loadVcfFileForGene('exac.vcf',transcript)

loadVcfFileForGene <- function(vcfFile, transcript,
                               includeFilteredSites=F,
                               n=-1,
                               refVersion="GRCh37",
                               returnGenotypes=F){
  chr <- transcript$seq_region_name
  start <- transcript$start
  end <- transcript$end

  if (start > end) {
    tmp=end
    end=start
    start=tmp
  }

  vdata <- loadVcfFileForRegion(vcfFile, chr, start, end, includeFilteredSites,n,refVersion,returnGenotypes)
  vdata
}

#' loadVcfFileForRegion
#'
#' Function to load vcf file for a particular genomic region
#' @param vcfFile File name to load
#' @param chr Chromosome/contig to load
#' @param start Initial genomic position to load
#' @param end Final genomic position to load
#' @param includeFilteredSites Return filtered-out sites (i.e. sites marked with anything other than PASS on vcf). Defaults to FALSE
#' @param n Max number of records to load. Set to -1 for no limit.
#' @param refVersion Reference version. Defaults to "GRCh37".
#' @param returnGenotypes Return genotype data. Default to FALSE.
#' @importFrom GenomicRanges as.data.frame
#' @keywords cats
#' @export
#' @examples
#' exacdata<- loadVcfFileForRegion('exac.vcf',"1",12345600,13345600)

loadVcfFileForRegion <- function(vcfFile, chr, start, end,
                               includeFilteredSites=F,
                               n=-1,
                               refVersion="GRCh37",
                               returnGenotypes=F){

  vRanges <- GRanges(as.character(chr),
                     IRanges(start=start, end=end,names = as.character(chr)))
  tf <- TabixFile(vcfFile)
  vdat <- readVcf(tf,refVersion, vRanges,row.names=F)
  r <- rowRanges(vdat)
  # clobber row names in case there are duplicate row entries which will make data frame conversion fail
  #names(r) <- as.character(1:length(names(r)))
  vRanges <- as.data.frame(r)
  vInfo <- info(vdat)
  vData <- cbind(vRanges,vInfo)
  if (!includeFilteredSites) {
    vData <- vData[vData$FILTER=="PASS",]
  }
  if (n>0) vData<-vData[1:n,]
  if (returnGenotypes) {
    list(siteData=vData,genotypeData=geno(vdat)$GT, sampleData=colData(vdat))
  }
  else vData
}
gdelangel/genomicRTools documentation built on May 16, 2019, 11:14 p.m.