R/vcfToSNVcatalogue.R

Defines functions vcfToSNVcatalogue

Documented in vcfToSNVcatalogue

#' VCF to SNV catalogue
#' 
#' Convert a vcf file containing SNV to SNV 96 channel trinuclotide context catalogue. The VCF file should containt the SNV of a single sample.
#' 
#' @param vcfFilename path to input VCF (file must be tabix indexed)
#' @param genome.v either "hg38" (will load BSgenome.Hsapiens.UCSC.hg38), "hg19" (will load BSgenome.Hsapiens.1000genomes.hs37d5), mm10 (will load BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10) or canFam3 (will load BSgenome.Cfamiliaris.UCSC.canFam3::BSgenome.Cfamiliaris.UCSC.canFam3)
#' @return returns the SNV catalogue for the given sample
#' @keywords vcf SNV
#' @export
#' @examples
#' file_subs <- "subs.vcf"
#' res <- vcfToSNVcatalogue(file_subs,genome.v = "hg38")
vcfToSNVcatalogue <- function(vcfFilename, genome.v="hg19") {
  
  # check that file exists
  if (!file.exists(vcfFilename)){
    message("[error vcfToSNVcatalogue] vcfFilename file not found: ",vcfFilename)
    return(NULL)
  }
  
  if(genome.v=="hg19"){
    expected_chroms <- paste0(c(seq(1:22),"X","Y"))
    genomeSeq <- BSgenome.Hsapiens.1000genomes.hs37d5::BSgenome.Hsapiens.1000genomes.hs37d5
  }else if(genome.v=="hg38"){
    expected_chroms <- paste0("chr",c(seq(1:22),"X","Y"))
    genomeSeq <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
  }else if(genome.v=="mm10"){
    expected_chroms <- paste0("chr",c(seq(1:19),"X","Y"))
    genomeSeq <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10
  }else if(genome.v=="canFam3"){
    expected_chroms <- paste0("chr",c(seq(1:38),"X")) 
    genomeSeq <- BSgenome.Cfamiliaris.UCSC.canFam3::BSgenome.Cfamiliaris.UCSC.canFam3
  }
 
  mut.order <- c("A[C>A]A","A[C>A]C","A[C>A]G","A[C>A]T","C[C>A]A","C[C>A]C","C[C>A]G","C[C>A]T","G[C>A]A","G[C>A]C","G[C>A]G","G[C>A]T","T[C>A]A","T[C>A]C","T[C>A]G","T[C>A]T","A[C>G]A","A[C>G]C","A[C>G]G","A[C>G]T","C[C>G]A","C[C>G]C","C[C>G]G","C[C>G]T","G[C>G]A","G[C>G]C","G[C>G]G","G[C>G]T","T[C>G]A","T[C>G]C","T[C>G]G","T[C>G]T","A[C>T]A","A[C>T]C","A[C>T]G","A[C>T]T","C[C>T]A","C[C>T]C","C[C>T]G","C[C>T]T","G[C>T]A","G[C>T]C","G[C>T]G","G[C>T]T","T[C>T]A","T[C>T]C","T[C>T]G","T[C>T]T","A[T>A]A","A[T>A]C","A[T>A]G","A[T>A]T","C[T>A]A","C[T>A]C","C[T>A]G","C[T>A]T","G[T>A]A","G[T>A]C","G[T>A]G","G[T>A]T","T[T>A]A","T[T>A]C","T[T>A]G","T[T>A]T","A[T>C]A","A[T>C]C","A[T>C]G","A[T>C]T","C[T>C]A","C[T>C]C","C[T>C]G","C[T>C]T","G[T>C]A","G[T>C]C","G[T>C]G","G[T>C]T","T[T>C]A","T[T>C]C","T[T>C]G","T[T>C]T","A[T>G]A","A[T>G]C","A[T>G]G","A[T>G]T","C[T>G]A","C[T>G]C","C[T>G]G","C[T>G]T","G[T>G]A","G[T>G]C","G[T>G]G","G[T>G]T","T[T>G]A","T[T>G]C","T[T>G]G","T[T>G]T")
  
  # read only chr seqnames from VCF, not contigs
  gr <- GenomicRanges::GRanges(GenomeInfoDb::seqinfo(genomeSeq))
  vcf_seqnames <- Rsamtools::headerTabix(vcfFilename)$seqnames 
    if (genome.v=="hg38" || genome.v=="mm10") {
    if(length(intersect(vcf_seqnames,expected_chroms))==0) vcf_seqnames <- paste0("chr",vcf_seqnames)
  }
  
  gr <- GenomeInfoDb::keepSeqlevels(gr,intersect(vcf_seqnames,expected_chroms),pruning.mode = "coarse")
  
  vcf_seqnames <- Rsamtools::headerTabix(vcfFilename)$seqnames
  if (genome.v=="hg38" || genome.v=="mm10") {
    if(length(intersect(vcf_seqnames,expected_chroms))==0) GenomeInfoDb::seqlevels(gr) <- sub("chr", "", GenomeInfoDb::seqlevels(gr))
  }
  
  # load the vcf file
  vcf_data <- VariantAnnotation::readVcf(vcfFilename, genome.v, gr)
  vcf_data <- VariantAnnotation::expand(vcf_data)
  nmutsloaded <- nrow(vcf_data)
  
  # check if there are mutations at all
  if(nmutsloaded==0){
    # return early an empty catalogue
    result <- list()
    result$catalogue <- as.data.frame(matrix(0,nrow = 96,ncol = 1,dimnames = list(mut.order,"catalogue")),stringsAsFactors = F)
    return(result)
  }
  
  #filters failed for each variant
  rd <- SummarizedExperiment::rowRanges(vcf_data)
  selectionvcf <- nchar(as.character(rd$REF))==1 & nchar(as.character(rd$ALT))==1
  vcf_data <- vcf_data[selectionvcf,,drop=F]
  nmutssnv <- nrow(vcf_data)
  
  if(nmutsloaded>nmutssnv){
    message("[vcfToSNVcatalogue warning] the vcf file ",vcfFilename," contains ",nmutsloaded-nmutssnv," indels, which were ignored.")
  }
  
  rd <- SummarizedExperiment::rowRanges(vcf_data)
  info.data <- VariantAnnotation::info(vcf_data)
  
  rgs <- IRanges::ranges(vcf_data)
  starts <- BiocGenerics::start(rgs)
  ends <-  BiocGenerics::end(rgs)
  
  #Check chromosomes exist
  chroms <- GenomeInfoDb::seqnames(vcf_data)

  if (length(chroms)==0){ 
    # return early an empty catalogue
    result <- list()
    result$catalogue <- as.data.frame(matrix(0,nrow = 96,ncol = 1,dimnames = list(mut.order,"catalogue")),stringsAsFactors = F)
    return(result)
  }
  
  if (genome.v=="hg38" || genome.v=="mm10") {
    if(length(intersect(chroms,expected_chroms))==0) chroms <- paste0("chr",chroms)
  }
  
  fxd <- (VariantAnnotation::fixed(vcf_data))
  wt <- as.character(rd$REF)
  mt <- as.character(rd$ALT)
  
  
  barcode <- paste(chroms, '-',starts,'-', mt, sep='')
  
  #get context
  bb <- as.character(BSgenome::getSeq(genomeSeq, chroms, start=starts-1, end=ends-1))
  ba <- as.character(BSgenome::getSeq(genomeSeq, chroms, start=starts+1, end=ends+1))
  wt.ref <- as.character(BSgenome::getSeq(genomeSeq, chroms, start=starts, end=ends))
  triplets <- as.character(BSgenome::getSeq(genomeSeq, chroms, start=starts-1, end=ends+1))

  mut.table <- data.frame(bbef=as.character(bb), wt=as.character(wt), mt=as.character(mt), baft=as.character(ba), stringsAsFactors=FALSE)


  mut.table$pyrwt <- as.character(mut.table$wt)
  mut.table$pyrmut <- as.character(mut.table$mt)
  mut.table$pyrbbef <- as.character(mut.table$bbef)
  mut.table$pyrbaft <- as.character(mut.table$baft)


  # the mutations originally not in pyramidine contex
  not.pyr <- ((wt=='G') | (wt=='A'))
  mut.table$pyrwt[not.pyr] <- as.character(toPyr(mut.table$wt[not.pyr]))
  mut.table$pyrmut[not.pyr] <- as.character(toPyr(mut.table$mt[not.pyr]))
  mut.table$pyrbbef[not.pyr] <- as.character(toPyr(mut.table$baft[not.pyr]))
  mut.table$pyrbaft[not.pyr] <- as.character(toPyr(mut.table$bbef[not.pyr]))

  all.hist <- generateHist(mut.table, normalise=FALSE,mut.order=mut.order)

  muts <- data.frame(chroms=chroms, starts=starts, ends = ends, wt=wt, mt=mt, pyrwt=mut.table$pyrwt , pyrmut=mut.table$pyrmut, pass=TRUE, barcode=barcode,
                 context=paste(mut.table$pyrbbef, '[',mut.table$pyrwt, '>',mut.table$pyrmut , ']', mut.table$pyrbaft,sep=''),
                 tumor.freq=rep(NA,length(chroms)), normal.freq=rep(NA,length(chroms)),
                 tumor.reads=rep(NA,length(chroms)), normal.reads=rep(NA,length(chroms)),
                 tumor.depth=rep(NA,length(chroms)), normal.depth=rep(NA,length(chroms)),
                                                         isSnp =rep(NA,length(chroms)),
                 stringsAsFactors = F)
  
  result<- list()
  result$catalogue <- data.frame(catalogue=all.hist,row.names = names(all.hist))
  result$muts <- muts
  result$mut.table <- mut.table
  result

}
Nik-Zainal-Group/signature.tools.lib documentation built on April 13, 2025, 5:50 p.m.