R/outputSomaticVariants.R

Defines functions expandFlags writeToVCF outputSomaticVariants

Documented in writeToVCF

#prints the somatic variants to an excel sheet.
outputSomaticVariants = function(variants, genome, plotDirectory, cpus=cpus, forceRedo=F, onlyForVEP=F, rareGermline=T) {
  vcfDir = paste0(plotDirectory, '/somatics')
  if ( !file.exists(vcfDir) ) dir.create(vcfDir)

  outfile = paste0(plotDirectory, '/somaticVariants.xls')
  if ( (!file.exists(outfile) | forceRedo) ) {
    somatics = list()
    XLsomatics = list()
    catLog('Printing somatic variants to ', outfile, '.\n', sep='')
    for ( sample in names(variants$variants) ) {
      catLog(sample, '..', sep='')
      q = variants$variants[[sample]]
      somaticP = q$somaticP
      toReturn = which(somaticP > 0)
      if ( !rareGermline ) toReturn = which(somaticP > 0 & (!q$germline | is.na(q$germline)))
      toReturn = toReturn[order(somaticP[toReturn], decreasing=T)]
      q = q[toReturn,]

      vcfFile = paste0(vcfDir, '/', sample, '.vcf')
      if ( !onlyForVEP ) writeToVCF(q, vcfFile, genome=genome)

      start = as.integer(xToPos(q$x, genome))
      end = as.integer(xToPos(q$x, genome))
      variant = q$variant
      reference = q$reference

      refvarstartendList = superFreq:::convertVariantsToVCF(list(reference, variant, start, end))
      reference = refvarstartendList[[1]]
      variant = refvarstartendList[[2]]
      start = refvarstartendList[[3]]
      end = refvarstartendList[[4]]
      
      somatic = data.frame(
        chr=xToChr(q$x, genome),
        start=start,
        end=end,
        reference=reference,
        variant=variant,
        inGene=gsub('.+:', '', q$inGene),
        severity=if ( 'severity' %in% names(q) ) q$severity else rep('na', nrow(q)),
        effect=if ( 'type' %in% names(q) ) q$type else rep('notChecked', nrow(q)),
        f=q$var/q$cov,
        cov=q$cov,
        ref=q$ref,
        var=q$var,
        flag=q$flag,
        pbq=q$pbq,
        pmq=q$pmq,
        psr=q$psr,
        somaticScore=q$somaticP,
        germlineLike=ifelse(is.na(q$germline), 'na', ifelse(q$germline, 'YES', '')),
        dbSNP=ifelse(q$db, 'dbSNP', ''),
        dbMAF=if ( 'dbMAF' %in% names(q) ) ifelse(q$db, q$dbMAF, '') else rep('na', nrow(q)),
        dbValidated=if ( 'dbValidated' %in% names(q) ) ifelse(q$db, q$dbValidated, '') else rep('na', nrow(q)),
        ExAC= if ( 'exac' %in% names(q) ) ifelse(q$exac, 'ExAC', '') else rep('na', nrow(q)),
        ExAC_AF = if ( 'exacAF' %in% names(q) ) ifelse(q$exac, q$exacAF, '') else rep('na', nrow(q)),
        ExAC_Filter = if ( 'exacFilter' %in% names(q) ) ifelse(q$exac, q$exacFilter, '') else rep('na', nrow(q)),
        row.names=rownames(q), stringsAsFactors=F)
      if ( all(moreVEPnames(genome=genome) %in% names(q)) ) {
        catLog('adding more VEP info..')
        somatic = cbind(somatic, q[,moreVEPnames(genome=genome)])
      }
      if ( all(annotationColumns(genome=genome) %in% names(q)) ) {
        catLog('adding more VariantAnnotation info..')
        somatic = cbind(somatic, q[,annotationColumns(genome=genome)])
      }
      #add VAFs and RD from other samples
      if ( length(variants$variants) > 1 ) {
      	otherSamples = names(variants$variants)[names(variants$variants) != sample]
      	for ( otherS in otherSamples ) {
      		q2 = variants$variants[[otherS]][rownames(q),]
      		vafs = cbind(q2$var/q2$cov, q2$cov, q2$var)
			colnames(vafs) = paste0(otherS, c('_VAF', '_RD', '_var'))
      		somatic = cbind(somatic, vafs)
      	}
      }
      
      if ( 'severity' %in% names(q) )  ord = order(q$severity + 10*ifelse(is.na(q$germline), 0, q$germline))
      else ord = order(10*q$germline)
      somatic = somatic[ord,]
      somatics[[sample]] = somatic
      if ( nrow(somatic) > 65000 ) warning('Truncating .xls somatic variants: too many rows.')
      XLsomatics[[sample]] = somatic[1:min(nrow(somatic), 65000),]
    }
    names(XLsomatics) = make.unique(substring(names(XLsomatics), 1, 29))
    if ( !onlyForVEP ) catLog('writing to xls...')
    if ( !onlyForVEP ) WriteXLS('XLsomatics', outfile)
    
    #probably want to swap this to .xlsx instead, for zip compression and much higher caps on row and column numbers
    #look into library("xlsx") for example?

	#this outputs all variants (beyond 65k) to a single .csv.
	#purpose was to simplify downstream analysis without hving to parse
	#a multi-tab .xls that may not even have all variants.
	#however, this file is rarely used afaik
	#and for some reason it often causes aout-of-memory crashes.
	#hence removed. Use .vcfs below for automated downstream analysis, or Rdata.
    #if ( !onlyForVEP ) catLog('Writing to .csv...')
    #for ( sample in names(somatics) ) {
    #  somatics[[sample]]$sample = rep(sample, nrow(somatics[[sample]]))
    #}
    #allSomatics = do.call(rbind, somatics)
    #if ( !onlyForVEP ) write.csv(allSomatics, gsub('.xls$', '.csv', outfile))
    #if ( !onlyForVEP ) catLog('done!\n')

    catLog('Outputting to directory ', vcfDir, '..')
    for ( name in names(somatics) ) {
      somatic = somatics[[name]]
      outfile = paste0(vcfDir, '/', name, '.txt')
      catLog(basename(outfile), '..', sep='')
      if ( nrow(somatic) > 0 ) {
        options(scipen=999)

        insertions = grepl('\\+', rownames(somatic))
        if ( any(insertions) ) {
          somatic$reference[insertions] = '-'
          somatic$start[insertions] = somatic$start[insertions]+1
          somatic$variant[insertions] = gsub('\\+', '', somatic$variant[insertions])
        }

        mx = cbind(as.character(somatic$chr),
          as.character(somatic$start), as.character(somatic$end),
          paste0(as.character(somatic$reference), '/', as.character(somatic$variant)),
          rep('+', nrow(somatic)), as.character(1:nrow(somatic)))
        write(t(mx), file=outfile, sep='\t', ncolumns=ncol(mx))
        options(scipen=5)
      }
      else
        write('No somatic mutations detected.', file=outfile, sep='\t')
    }
    catLog('done.\n')
  }
}


#' exports variants to VCF
#'
#' @param q A variant data frame from superFreq.
#' @param vcfFile The path to the output file.
#' @param genome The genome, such as 'hg19', 'hg38' or 'mm10'. Defaults to 'hg19'.
#' @param SNVonly boolean. Set to TRUE to only output SNVs, not indels. Defaults to FALSE.
#' @param addSomaticP boolean. Set to TRUE to include a column with the somaticP score from superFreq. Defaults to FALSE.
#'
#' @details This function outputs superFreq variants to a VCF for access from other software.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' data = loadData(Rdirectory)
#' q = data$allVariants$variants$variants$mySample
#' writeToVCF(q, 'mySample.superFreq.vcf')
#' }
writeToVCF = function(q, vcfFile, genome='hg19', snvOnly=F, addSomaticP=F) {
  if ( snvOnly ) q = q[q$variant %in% c('A', 'T', 'C', 'G'),]

  refvarstartendList = convertVariantsToVCF(list(q$reference, q$variant, xToPos(q$x, genome), xToPos(q$x, genome)))
  reference = refvarstartendList[[1]]
  variant = refvarstartendList[[2]]
  start = refvarstartendList[[3]]
  end = refvarstartendList[[4]]
  
  preambula = c('##fileformat=VCFv4.0',
    '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tCOVERAGE\tVARIANTREADS\tVAF')
  if ( addSomaticP )
  preambula = c('##fileformat=VCFv4.0',
    paste0('##reference=', genome),
    '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
    '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
    '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO')
  chrom = xToChr(q$x, genome)
  pos = as.integer(xToPos(q$x, genome))
  ID = rownames(q)
  ref = reference
  alt = variant
  qual = pmin(60, -log10(1-q$somaticP)*10)
  filter = gsub('^$', 'PASS', expandFlags(q$flag))
  info = paste0('DP=', q$cov, ';AF=', ifelse(q$cov > 0, q$var/q$cov, 0))

  out = cbind(chrom, pos, ID, ref, alt, qual, filter, info)
  if ( addSomaticP )
      out = cbind(chrom, pos, ID, ref, alt, qual, filter, info, q$somaticP)
  body = apply(out, 1, function(strs) do.call(paste, c(as.list(strs), sep='\t')))
  body = do.call(paste, c(as.list(preambula), as.list(body), sep='\n'))

  write.table(body, file=vcfFile, row.names=F, col.names=F, quote=F)
}

expandFlags = function(flags) {
  flags = gsub('(Rep)([A-Z]|$)', 'Repeatregion:\\2', flags)
  flags = gsub('(Bq)([A-Z]|$)', 'Basecallquality:\\2', flags)
  flags = gsub('(Mq)([A-Z]|$)', 'Mappingquality:\\2', flags)
  flags = gsub('(Svr)([A-Z]|$)', 'Singlevariantread:\\2', flags)
  flags = gsub('(Nnc)([A-Z]|$)', 'Normalnoiseconsistent:\\2', flags)
  flags = gsub('(Nnn)([A-Z]|$)', 'Normalnoisenonconsistent:\\2', flags)
  flags = gsub('(Vn)([A-Z]|$)', 'Variablenormals:\\2', flags)
  flags = gsub('(Mv)([A-Z]|$)', 'Minorvariant:\\2', flags)
  flags = gsub('(St)([A-Z]|$)', 'Stutter:\\2', flags)
  flags = gsub('(Sb)([A-Z]|$)', 'Strandbias:\\2', flags)
  flags = gsub('(Pa)([A-Z]|$)', 'Palindromic:\\2', flags)
  flags = gsub('(Mc)([A-Z]|$)', 'Manycopies:\\2', flags)
  flags = gsub('(Pn)([A-Z]|$)', 'Polymorphicnormals:\\2', flags)
  flags = gsub('(Nr)([A-Z]|$)', 'Noisyregion:\\2', flags)

  return(flags)
}
ChristofferFlensburg/superFreq documentation built on Nov. 15, 2023, 6:15 a.m.