R/runVEP.R

Defines functions getCosmicCensusDensity getCosmicCounts moreVEPnames addNullAnnotation getMoreVEPinfo postAnalyseVEP severityToType typeToSeverity genomeToAssembly runVEP

Documented in getCosmicCounts getMoreVEPinfo moreVEPnames postAnalyseVEP runVEP severityToType typeToSeverity

#' Run VEP on the provided variants
#'
#' @param variants variants: The variants to be VEPed.
#' @param plotDir character: The plot directory where outputSomatics have been run.
#' @param cpus integer: The number of cpus to be used as most.
#' @param forceRedoVEP Logical: if VEP should be rerun even if saved data already exists.
#'
#' @details This function calls VEP on the output from outputSomaticVariants. For this, VEP needs to be callable by system('vep').
runVEP = function(variants, plotDir, cpus=1, genome='hg19', vepCall='vep', forceRedoVEP=F, rareGermline=T) {
  catLog('VEP-ing..')
  dir = paste0(plotDir, '/somatics/')
  for ( name in names(variants$variants) ) {
    infile = paste0(dir, '/', name, '.txt')
    VEPfile = paste0(dir, '/', name, '.VEP.txt')
    if ( !file.exists(VEPfile) | forceRedoVEP ) {
      wd = getwd()
      catLog('Moving to ', dir, '\n')
      setwd(dir)
      q = variants$variants[[name]]
      nVariants = sum(q$somaticP > 0 & !is.na(q$somaticP))
      if ( !rareGermline )
        nVariants = sum(q$somaticP > 0 & (!q$germline | is.na(q$germline)) & !is.na(q$somaticP))
      catLog(name, ': ', nVariants, ' variants.\n', sep='')
      if ( nVariants == 0 ) {
        catLog('Moving back to ', wd, '\n')
        setwd(wd)
        next
      }
      assembly = genomeToAssembly(genome)
      #seems like veps format of the version output in the --help changed, so skipping version check and assuming 89...
      if ( FALSE ) {
        catLog('Checking VEP version.\n')
        catLog(paste0(vepCall, ' --help'), '\n')
        vepHelp = system(paste0(vepCall, ' --help'), intern=T)
        versionLine = strsplit(vepHelp[grep('^version', vepHelp)], ' ')[[1]]
        vepVersion = as.numeric(versionLine[length(versionLine)])
      }
      vepVersion = 89
      if ( vepVersion < 76 & genome == 'hg38' ) {
        warning('VEP seems to be pre version 76, which isnt supported for hg38. Please update VEP. Running without annotation for now.')
        return(variants)
      }
      if ( vepVersion < 76 ) {
        catLog('Using pre-76 VEP version call. Not supported, but trying...\n')
        if ( cpus == 1 )
          call = paste0(vepCall, ' -i ', basename(infile), ' -o ', basename(VEPfile), ' --cache --everything --force_overwrite')
        else
          call = paste0(vepCall, ' -i ', basename(infile), ' -o ', basename(VEPfile), ' --cache --everything --force_overwrite --fork ', cpus)        
      }
      else {
        if ( cpus == 1 )
          call = paste0(vepCall, ' -i ', basename(infile), ' -o ', basename(VEPfile), ' --cache --everything --force_overwrite --assembly ', assembly)
        else
          call = paste0(vepCall, ' -i ', basename(infile), ' -o ', basename(VEPfile), ' --cache --everything --force_overwrite --assembly ', assembly, ' --fork ', cpus)
      }
      if ( genome == 'mm10' ) call = paste0(vepCall, ' -i ', basename(infile), ' -o ', basename(VEPfile), ' --cache --everything --force_overwrite --fork ', cpus, ' --species mus_musculus')
      catLog(call, '\n')
      call = paste0(call, '\n')
      systemRet = system(call, intern=T)
      
      #not sure if the current VEP version use the word "Finished", so removing this warning.
      #if ( !any(grepl('Finished', systemRet)) ) warning('VEP run didnt finish cleanly!')
      
      catLog('Moving back to ', wd, '\n')
      setwd(wd)
    }
  }
  catLog('done.\n')
  
  catLog('Importing VEP results:\n')
  for ( name in names(variants$variants) ) {
    catLog(name, ': ', sep='')
    if ( sum(variants$variants[[name]]$somaticP > 0 & !is.na(variants$variants[[name]]$somaticP)) == 0 ) {
      catLog('no somatic variants.\n')
      variants$variants[[name]]$type = rep('notChecked', nrow(variants$variants[[name]]))
      variants$variants[[name]]$severity = rep(100, nrow(variants$variants[[name]]))
      next
    }
    VEPfile = paste0(dir, '/', name, '.VEP.txt')
    VEPdata = try(read.table(VEPfile, fill=T, header=F, colClasses='character'), silent=T)
    if ( inherits(VEPdata, 'try-error') ) {
      catLog('failed to read VEP file ', VEPfile,'.\n')
      variants$variants[[name]]$type = rep('notChecked', nrow(variants$variants[[name]]))
      variants$variants[[name]]$severity = rep(100, nrow(variants$variants[[name]]))
    }
    else {
      ID = as.numeric(as.factor(VEPdata$V1))
      chr = sapply(strsplit(as.character(VEPdata$V2), ':'), function(v) v[1])
      pos = sapply(strsplit(as.character(VEPdata$V2), ':'), function(v) v[2])
      end = as.numeric(gsub('^.*-', '',pos))
      pos = as.numeric(gsub('-.*$', '',pos))
      var = as.character(VEPdata$V3)
      deletions = grepl('-', var)
      if ( any(deletions) ) pos[deletions] = pos[deletions] - 1
      type = as.character(VEPdata$V7)
      sev = sapply(type, typeToSeverity)
      x = chrToX(chr, pos, genome=genome)

      lastCol = strsplit(as.character(VEPdata$V14), ';')
      polyPhen = sapply(lastCol, function(strs) {
        if ( any(grepl('PolyPhen=', strs)) ) return(as.numeric(gsub('\\)$', '', gsub('^.*\\(', '', strs[grep('PolyPhen=',strs)[1]]))))
        else return(0.05)
        })
      
      namevar = var
      namevar[nchar(namevar) > 1] = paste0('+', substring(var[nchar(var) > 1], 2))
      namevar[namevar == '-'] = paste0('-', pmax(1, as.numeric(end)[namevar == '-']-as.numeric(pos)[namevar == '-']))
      rowNames = paste0(x, namevar)
      qNames = rep('', length(unique(ID)))
      qNames[ID] = rowNames
      
      #find most severe effect
      mostSev = rep('unkown', length(unique(ID)))
      sevScore = rep(100, length(unique(ID)))
      for ( i in 1:nrow(VEPdata) ) {
        sevI = sev[i]
        IDI = ID[i]
        if ( sevI - polyPhen[i] < sevScore[IDI] ) {
          sevScore[IDI] = sevI - polyPhen[i]
          mostSev[IDI] = severityToType(sevI)
        }
      }
      
      #add effect and severity to q
      variants$variants[[name]]$type = rep('notChecked', nrow(variants$variants[[name]]))
      variants$variants[[name]]$severity = rep(100, nrow(variants$variants[[name]]))
      
      variants$variants[[name]][qNames,]$type = mostSev
      variants$variants[[name]][qNames,]$severity = sevScore
      catLog(length(mostSev), 'VEPed variants.\n')
    }
  }
  catLog('done.\n')
  return(variants)
}

genomeToAssembly = function(genome) {
  if ( genome == 'hg19' ) return('GRCh37')
  if ( genome == 'hg38' ) return('GRCh38')
  return('')
}

#' Ranks the variant effects
#'
#' @param type character: The effect.
#'
#' @details Returns a low score for severe effects, higher for less severe, 100 for unknown. 11 and below alters the protein.
typeToSeverity = function(type, annotationMethod='VEP') {
  if ( annotationMethod == 'VariantAnnotation' ) return(VAconsequenceToSeverityRank(type))
  
  #first change some (older? newer?) notation into the ensemble SO terms
  type = gsub('non_coding_exon_variant', 'non_coding_transcript_exon_variant', type)
  type = gsub('nc_transcript_variant', 'non_coding_transcript_exon_variant', type)

  if ( grepl('transcript_ablation', type) ) return(1)
  if ( grepl('splice_acceptor_variant', type) ) return(2)
  if ( grepl('splice_donor_variant', type) ) return(3)
  if ( grepl('stop_gained', type) ) return(4)
  if ( grepl('frameshift_variant', type) ) return(5)
  if ( grepl('stop_lost', type) ) return(6)
  if ( grepl('start_lost', type) ) return(7)
  if ( grepl('initiator_codon_variant', type) ) return(7)
  if ( grepl('transcript_amplification', type) ) return(8)
  if ( grepl('inframe_insertion', type) ) return(9)
  if ( grepl('inframe_deletion', type) ) return(10)
  if ( grepl('missense_variant', type) ) return(11)
  if ( grepl('protein_altering_variant', type) ) return(12)
  if ( grepl('splice_region_variant', type) ) return(13)
  if ( grepl('incomplete_terminal_codon_variant', type) ) return(14)
  if ( grepl('stop_retained_variant', type) ) return(15)
  if ( grepl('synonymous_variant', type) ) return(16)
  if ( grepl('coding_sequence_variant', type) ) return(17)
  if ( grepl('mature_miRNA_variant', type) ) return(18)
  if ( grepl('5_prime_UTR_variant', type) ) return(19)
  if ( grepl('3_prime_UTR_variant', type) ) return(20)
  if ( grepl('non_coding_transcript_exon_variant', type) ) return(21)
  if ( grepl('intron_variant', type) ) return(22)
  if ( grepl('NMD_transcript_variant', type) ) return(23)
  if ( grepl('non_coding_transcript_variant', type) ) return(24)
  if ( grepl('upstream_gene_variant', type) ) return(25)
  if ( grepl('downstream_gene_variant', type) ) return(26)
  if ( grepl('TFBS_ablation', type) ) return(27)
  if ( grepl('TFBS_amplification', type) ) return(28)
  if ( grepl('TF_binding_site_variant', type) ) return(29)
  if ( grepl('regulatory_region_ablation', type) ) return(30)
  if ( grepl('regulatory_region_amplification', type) ) return(31)
  if ( grepl('regulatory_region_variant', type) ) return(32)
  if ( grepl('feature_elongation', type) ) return(33)
  if ( grepl('feature_truncation', type) ) return(34)
  if ( grepl('intergenic_variant', type) ) return(35)
  
  if ( type == '-' ) return(100)
  if ( grepl('unknown', type) ) return(100)
  catLog('Dont know about the mutation type: ', type, '\n')
  return(100)
}

#' The variant effect as function of severity
#'
#' @param severity integer: The severity rank.
#'
#' @details Translated back from severity rank to the variant effect.
severityToType = function(severity, annotationMethod='VEP') {
  if ( annotationMethod == 'VariantAnnotation' ) return(VAseverityToConsequence(severity))
  
  if ( severity == 1 ) return('transcript_ablation')
  if ( severity == 2 ) return('splice_acceptor_variant')
  if ( severity == 3 ) return('splice_donor_variant')
  if ( severity == 4 ) return('stop_gained')
  if ( severity == 5 ) return('frameshift_variant')
  if ( severity == 6 ) return('stop_lost')
  if ( severity == 7 ) return('start_lost')
  if ( severity == 8 ) return('transcript_amplification')
  if ( severity == 9 ) return('inframe_insertion')
  if ( severity == 10 ) return('inframe_deletion')
  if ( severity == 11 ) return('missense_variant')
  if ( severity == 12 ) return('protein_altering_variant')
  if ( severity == 13 ) return('splice_region_variant')
  if ( severity == 14 ) return('incomplete_terminal_codon_variant')
  if ( severity == 15 ) return('stop_retained_variant')
  if ( severity == 16 ) return('synonymous_variant')
  if ( severity == 17 ) return('coding_sequence_variant')
  if ( severity == 18 ) return('mature_miRNA_variant')
  if ( severity == 19 ) return('5_prime_UTR_variant')
  if ( severity == 20 ) return('3_prime_UTR_variant')
  if ( severity == 21 ) return('non_coding_transcript_exon_variant')
  if ( severity == 22 ) return('intron_variant')
  if ( severity == 23 ) return('NMD_transcript_variant')
  if ( severity == 24 ) return('non_coding_transcript_variant')
  if ( severity == 25 ) return('upstream_gene_variant')
  if ( severity == 26 ) return('downstream_gene_variant')
  if ( severity == 27 ) return('TFBS_ablation')
  if ( severity == 28 ) return('TFBS_amplification')
  if ( severity == 29 ) return('TF_binding_site_variant')
  if ( severity == 30 ) return('regulatory_region_ablation')
  if ( severity == 31 ) return('regulatory_region_amplification')
  if ( severity == 32 ) return('regulatory_region_variant')
  if ( severity == 33 ) return('feature_elongation')
  if ( severity == 34 ) return('feature_truncation')
  if ( severity == 35 ) return('intergenic_variant')
  if ( severity == 100 ) return('unknown')
  return('unknown')
}



#' Runs VEP on a R and plot directory that superFreq has been run on.
#'
#' @param outputDirectories named list with Rdirectory and plotDirectory.
#' @param inputFiles named list as in analyse.
#' @param genome character: such as 'hg19', 'hg38' or 'mm10'
#' @param cpus integer. maximum number of parallel threads.
#' @param cosmicDirectory character: path to COSMIC directory. (optional)
#' @param forceRedo boolean: if TRUE, previously saved data is ignored and overwritten.
#'
#' @export
#' @details Runs VEP on the variants, and saves the variants (retrivable with loadData) with the addition
#'          information from VEP, and COSMIC if provided.
postAnalyseVEP = function(outputDirectories, inputFiles=NA, metaData=NA, genome='hg19', cpus=1, cosmicDirectory='', vepCall='vep', forceRedo=F) {
  Rdirectory = outputDirectories$Rdirectory
  plotDirectory = outputDirectories$plotDirectory
  data = loadData(Rdirectory)
  if ( !('allVariants' %in% names(data)) ) {
    warning('Cant find a saved allVariants.\n')
    return()
  }
  
  if ( genome %in% c('hg19', 'hg38') & 'cosmic' %in% names(data$allVariants$variants$variants[[1]]) & !forceRedo ) return()
  if ( genome == 'mm10' & 'isCCGD' %in% names(data$allVariants$variants$variants[[1]]) & !forceRedo ) return()

  backup = paste0(Rdirectory, '/allVariantsPreVEP.Rdata')
  if ( !file.exists(backup) ) {
    catLog('Backing up variants (in case the vep run goes wrong) to', backup, '\n')
    allVariantsPreVEP = data$allVariants
    if ( !is.null(allVariantsPreVEP) )
      save('allVariantsPreVEP', file=backup)
  }

  variants = data$allVariants$variants
  if ( !('allVariants' %in% names(data)) & 'variants' %in% names(data) )
    variants = data$variants
  if ( inherits(metaData, 'data.frame') )
    sampleMetaData = metaData
  else if ( inherits(inputFiles, 'list') )
    sampleMetaData = importSampleMetaData(inputFiles$metaDataFile)
  else
    stop('Neither inputFiles nor metaData was defined.')
  normals = as.logical(gsub('YES', 'T', gsub('NO', 'F', sampleMetaData$NORMAL)))
  names = make.names(sampleMetaData$NAME, unique=T)
  names(normals) = names
  individuals = sampleMetaData$INDIVIDUAL
  timePoints = sampleMetaData$TIMEPOINT
  names(timePoints) = names(individuals) = names
  samplePairs = metaToSamplePairs(names, individuals, normals)
  timeSeries = metaToTimeSeries(names, individuals, normals)

  outputSomaticVariants(variants, genome=genome, plotDirectory=plotDirectory, cpus=cpus, forceRedo=forceRedo)
  variants = runVEP(variants, plotDirectory, cpus=cpus, genome=genome, vepCall=vepCall, forceRedoVEP=forceRedo)
  variants = getMoreVEPinfo(variants, plotDirectory, genome=genome, cosmicDirectory=cosmicDirectory)
  allVariants = data$allVariants
  allVariants$variants = variants

  allVariantSaveFile = paste0(Rdirectory, '/allVariants.Rdata')
  catLog('Saving fully annotated variants to', allVariantSaveFile, '...')
  save('allVariants', file=allVariantSaveFile)
  catLog('done.\n')
  storiesSaveFile = paste0(Rdirectory, '/stories.Rdata')
  catLog('Replacing fully annotated variants in', storiesSaveFile, '...')
  load(storiesSaveFile)
  stories$variants = variants
  save('stories', file=storiesSaveFile)
  catLog('done.\n')

  outputSomaticVariants(variants, genome, plotDirectory, cpus=cpus, forceRedo=T)
  makeSNPprogressionPlots(variants, timeSeries=timeSeries, normals = normals, plotDirectory=plotDirectory,
                          genome=genome, forceRedo=T, maxRowCluster=2000)
  makeRiverPlots(data$stories$stories, variants, genome=genome, cpus=cpus, plotDirectory=plotDirectory, forceRedo=T)
  makeScatterPlots(variants, samplePairs, timePoints, plotDirectory, genome=genome, cpus=cpus, forceRedo=T)
  makeCloneScatterPlots(variants, data$stories$stories, samplePairs, individuals, timePoints,
                        plotDirectory, genome=genome, cpus=cpus, forceRedo=T)
  outputNewVariants(variants, samplePairs, genome, plotDirectory, cpus=cpus, forceRedo=T)
}



#' Import more information about the variants
#'
#' @param variants variants: The variants
#' @param plotDirectory character: The plot directory from the analyse call.
#'
#' @details Extract almost all the information from the VEP run, and cross-checks with COSMIC data as well.
#'          getCosmicCounts is called by this function, see details of that for requirements.
getMoreVEPinfo = function(variants, plotDirectory, genome='hg19', cosmicDirectory='') {
  dir = paste0(plotDirectory, '/somatics')
  catLog('Importing more VEP info:\n')
  for ( name in names(variants$variants) ) {
    catLog(name, '.. ')
    if ( sum(variants$variants[[name]]$somaticP > 0 & !is.na(variants$variants[[name]]$somaticP)) == 0 ) {
      catLog('no somatic variants.\n')
      variants$variants[[name]] = addNullAnnotation(variants$variants[[name]], genome=genome)
      next
    }
    else {
      VEPfile = paste0(dir, '/', name, '.VEP.txt')
      VEPdata = try(read.table(VEPfile, fill=T, header=F, colClasses='character'), silent=T)
      if ( inherits(VEPdata, 'try-error') )
        catLog('failed to read VEP file ', VEPfile,'.\n')
      
      ID = as.numeric(as.factor(VEPdata$V1))
      var = as.character(VEPdata$V3)
      chr = sapply(strsplit(as.character(VEPdata$V2), ':'), function(v) v[1])
      pos = sapply(strsplit(as.character(VEPdata$V2), ':'), function(v) v[2])
      end = as.numeric(gsub('^.*-', '',pos))
      pos = as.numeric(gsub('-.*$', '',pos))
      deletions = grepl('-', var)
      if ( any(deletions) ) pos[deletions] = pos[deletions] - 1
      type = as.character(VEPdata$V7)
      sev = sapply(type, typeToSeverity)
      x = chrToX(chr, pos, genome)
      AApos = VEPdata$V10
      AAbefore = gsub('\\/.*$', '', VEPdata$V11)
      AAafter = gsub('^.*\\/', '', VEPdata$V11)

      secondLastCol = strsplit(as.character(VEPdata$V13), ',')
      lastCol = strsplit(as.character(VEPdata$V14), ';')
      polyPhen = sapply(lastCol, function(strs) {
        if ( any(grepl('PolyPhen=', strs)) ) return(gsub('^PolyPhen=', '', strs[grep('PolyPhen=',strs)[1]]))
        else return('')
        })
      numericPolyPhen = sapply(lastCol, function(strs) {
        if ( any(grepl('PolyPhen=', strs)) )
          return(as.numeric(gsub('\\)$', '', gsub('^.*\\(', '', strs[grep('PolyPhen=',strs)[1]]))))
        else return(0.0)
        })

      symbol = sapply(lastCol, function(strs) {
        if ( any(grepl('SYMBOL=', strs)) ) return(gsub('^SYMBOL=', '', strs[grep('SYMBOL=',strs)[1]]))
        else return('')
        })
      exon = sapply(lastCol, function(strs) {
        if ( any(grepl('EXON=', strs)) ) return(gsub('^EXON=', '', strs[grep('EXON=',strs)[1]]))
        else return('')
      })
      sift = sapply(lastCol, function(strs) {
        if ( any(grepl('SIFT=', strs)) ) return(gsub('^SIFT=', '', strs[grep('SIFT=',strs)[1]]))
        else return('')
      })
      domain = sapply(lastCol, function(strs) {
        if ( any(grepl('DOMAINS=', strs)) ) return(gsub('\\,.*$', '', gsub('^DOMAINS=', '', strs[grep('DOMAINS=',strs)[1]])))
        else return('')
      })

      if ( genome %in% c('hg19', 'hg38') ) {
        cosmic = sapply(secondLastCol, function(strs) {
          if ( any(grepl('COSM', strs)) ) return(strs[grep('COSM',strs)[1]])
          else return('')
        })
        cosmicCounts = getCosmicCounts(cosmic, cosmicDirectory=cosmicDirectory, genome=genome)
        isCosmicCensus = cosmicCounts$found
        cosmicCounts = getCosmicCounts(cosmic, cosmicDirectory=cosmicDirectory, onlyCensus=F, genome=genome)
        cosmicVariantDensity = cosmicCounts$variantDensity
        cosmicGeneDensity = cosmicCounts$geneDensity
        
        censusDensity = getCosmicCensusDensity(cosmicDirectory=cosmicDirectory)
        noHitCensus = !isCosmicCensus & symbol %in% names(censusDensity) & sev <= 11
        if ( any(noHitCensus) ) {
          isCosmicCensus[noHitCensus] = T
          cosmicGeneDensity[noHitCensus] = censusDensity[symbol[noHitCensus]]
        }
      }
      if ( genome == 'mm10' ) {
        catLog('adding CCGD data..')
        CCGDdata = read.table(paste0(cosmicDirectory, '/CCGD_export.csv'), sep=',', header=T, stringsAsFactors=F)

        censusGenes = unique(CCGDdata$Mouse.Symbol)
        isCCGD = symbol %in% censusGenes

        CCGDsummary = sapply(censusGenes, function(gene) {
          subData = CCGDdata[CCGDdata$Mouse.Symbol == gene,]
          studies = subData$Studies[1]
          humanGene = subData$Human.Symbol[1]
          
          cancerTypes = sort(table(subData$Cancer.Type), decreasing=T)
          names(cancerTypes) = gsub(' Cancer', '',names(cancerTypes))
          cancerTypes = gsub(';$','',do.call(paste0, as.list(paste0(names(cancerTypes), ":", cancerTypes, ';'))))

          isInCosmic = subData$COSMIC[1] == 'Yes'
          isInCGC = subData$CGC[1] == 'Yes'

          ranks = table(c('A', 'B', 'C', 'D', subData$Relative.Rank))
          ranks[c('A', 'B', 'C', 'D')] = ranks[c('A', 'B', 'C', 'D')] - 1
          names(ranks) = gsub(' Cancer', '',names(ranks))
          ranks = gsub(';$','',do.call(paste0, as.list(paste0(names(ranks), ":", ranks, ';'))))

          return(c('studies'=studies, 'cancerTypes'=cancerTypes, 'cosmic'=isInCosmic,
                   'cgc'=isInCGC, 'ranks'=ranks, 'humanGene'=humanGene))
        })
        CCGDsummary = as.data.frame(t(CCGDsummary), stringsAsFactors=F)
        rownames(CCGDsummary) = censusGenes
        CCGDsummary$studies = as.numeric(CCGDsummary$studies)
        CCGDsummary$cosmic = as.logical(CCGDsummary$cosmic)
        CCGDsummary$cgc = as.logical(CCGDsummary$cgc)
        
        CCGDstudies = rep(0, length(symbol))
        CCGDcancerTypes = rep('', length(symbol))
        CCGDcosmic = rep('', length(symbol))
        CCGDcgc = rep('', length(symbol))
        CCGDranks = rep('', length(symbol))

        CCGDstudies[isCCGD] = CCGDsummary[symbol[isCCGD],]$studies
        CCGDcancerTypes[isCCGD] = CCGDsummary[symbol[isCCGD],]$cancerTypes
        CCGDcosmic[isCCGD] = ifelse(CCGDsummary[symbol[isCCGD],]$cosmic, 'YES!', 'no')
        CCGDcgc[isCCGD] = ifelse(CCGDsummary[symbol[isCCGD],]$cgc, 'YES!', 'no')
        CCGDranks[isCCGD] = CCGDsummary[symbol[isCCGD],]$ranks

        catLog('found ', sum(isCCGD), ' entries.\n', sep='')
      }
      
      namevar = var
      namevar[nchar(namevar) > 1] = paste0('+', substring(var[nchar(var) > 1], 2))
      namevar[namevar == '-'] = paste0('-', pmax(1, as.numeric(end)[namevar == '-']-as.numeric(pos)[namevar == '-']))
      rowNames = paste0(x, namevar)
      qNames = rep('', length(unique(ID)))
      qNames[ID] = rowNames
      
      #find most severe effect
      mostSev = rep('unkown', length(unique(ID)))
      sevScore = rep(100, length(unique(ID)))
      polyPhenRet = rep('', length(unique(ID)))
      exonRet = rep('', length(unique(ID)))
      siftRet = rep('', length(unique(ID)))
      symbolRet = rep('', length(unique(ID)))
      AAposRet = rep('', length(unique(ID)))
      AAbeforeRet = rep('', length(unique(ID)))
      AAafterRet = rep('', length(unique(ID)))
      domainRet = rep('', length(unique(ID)))
      cosmicRet = rep('', length(unique(ID)))
      isCosmicCensusRet = rep(F, length(unique(ID)))
      cosmicVariantDensityRet = rep(0, length(unique(ID)))
      cosmicGeneDensityRet = rep(0, length(unique(ID)))
      CCGDstudiesRet = rep(0, length(unique(ID)))
      CCGDcancerTypesRet = rep('', length(unique(ID)))
      CCGDcosmicRet = rep(F, length(unique(ID)))
      CCGDcgcRet = rep(F, length(unique(ID)))
      CCGDranksRet = rep('', length(unique(ID)))
      for ( i in 1:nrow(VEPdata) ) {
        sevI = sev[i]
        IDI = ID[i]
        if ( sevI - numericPolyPhen[i] < sevScore[IDI] ) {
          sevScore[IDI] = sevI - numericPolyPhen[i]
          mostSev[IDI] = severityToType(sevI)

          polyPhenRet[IDI] = polyPhen[i]
          exonRet[IDI] = exon[i]
          siftRet[IDI] = sift[i]
          symbolRet[IDI] = symbol[i]
          AAposRet[IDI] = AApos[i]
          AAbeforeRet[IDI] = AAbefore[i]
          AAafterRet[IDI] = AAafter[i]
          domainRet[IDI] = domain[i]
          if ( genome %in% c('hg19', 'hg38') ) {
            cosmicRet[IDI] = cosmic[i]
            isCosmicCensusRet[IDI] = isCosmicCensus[i]
            cosmicVariantDensityRet[IDI] = cosmicVariantDensity[i]
            cosmicGeneDensityRet[IDI] = cosmicGeneDensity[i]
          }
          if ( genome == 'mm10' ) {
            CCGDstudiesRet[IDI] = CCGDstudies[i]
            CCGDcancerTypesRet[IDI] = CCGDcancerTypes[i]
            CCGDcosmicRet[IDI] = CCGDcosmic[i]
            CCGDranksRet[IDI] = CCGDcgc[i]
            CCGDranksRet[IDI] = CCGDranks[i]
          }
        }
      }
      
      #add effect and severity to q
      variants$variants[[name]] = addNullAnnotation(variants$variants[[name]], genome=genome)
      
      variants$variants[[name]][qNames,]$severity = sevScore
      variants$variants[[name]][qNames,]$type = mostSev
      variants$variants[[name]][qNames,]$polyPhen = polyPhenRet
      variants$variants[[name]][qNames,]$sift = siftRet
      for ( sampleName in names(variants$variants) ) {
        present = qNames %in% rownames(variants$variants[[sampleName]])
        variants$variants[[sampleName]][qNames[present],]$inGene[symbolRet[present] != ''] = symbolRet[present][symbolRet[present] != '']
      }
      variants$variants[[name]][qNames,]$exon = exonRet
      variants$variants[[name]][qNames,]$AApos = AAposRet
      variants$variants[[name]][qNames,]$AAbefore = AAbeforeRet
      variants$variants[[name]][qNames,]$AAafter = AAafterRet
      variants$variants[[name]][qNames,]$domain = domainRet
      if ( genome %in% c('hg19', 'hg38') ) {
        variants$variants[[name]][qNames,]$cosmic = cosmicRet
        variants$variants[[name]][qNames,]$isCosmicCensus = isCosmicCensusRet
        variants$variants[[name]][qNames,]$cosmicVariantMPM = cosmicVariantDensityRet
        variants$variants[[name]][qNames,]$cosmicGeneMPMPB = cosmicGeneDensityRet
      }
      if ( genome == c('mm10') ) {
        variants$variants[[name]][qNames,]$CCGDstudies = CCGDstudiesRet
        variants$variants[[name]][qNames,]$CCGDcancerTypes = CCGDcancerTypesRet
        variants$variants[[name]][qNames,]$CCGDcosmic = CCGDcosmicRet
        variants$variants[[name]][qNames,]$CCGDcgc = CCGDcgcRet
        variants$variants[[name]][qNames,]$CCGDranks = CCGDranksRet
      }
      catLog(length(mostSev), 'VEPed variants.\n')
    }
  }
  catLog('done.\n')
  return(variants)
}

addNullAnnotation = function(q, genome='hg19') {
  q$severity = rep(100, nrow(q))
  q$type = rep('unknown', nrow(q))
  q$polyPhen = rep('', nrow(q))
  q$sift = rep('', nrow(q))
  q$exon = rep('', nrow(q))
  q$AApos = rep('', nrow(q))
  q$AAbefore = rep('', nrow(q))
  q$AAafter = rep('', nrow(q))
  q$domain = rep('', nrow(q))
  if ( genome %in% c('hg19', 'hg38') ) {
    q$cosmic = rep('', nrow(q))
    q$isCosmicCensus = rep(F, nrow(q))
    q$cosmicVariantMPM = rep(0, nrow(q))
    q$cosmicGeneMPMPB = rep(0, nrow(q))
  }
  if ( genome == 'mm10' ) {
    q$isCCGD = rep(F, nrow(q))
    q$CCGDstudies = rep(0, nrow(q))
    q$CCGDcancerTypes = rep('', nrow(q))
    q$CCGDcosmic = rep(F, nrow(q))
    q$CCGDcgc = rep(F, nrow(q))
    q$CCGDranks = rep('', nrow(q))
  }
  return(q)
}

#' internal function
#'
#' @details Just remembers which columns are added by getMoreVEPinfo
moreVEPnames = function(genome='hg19') {
  if ( genome %in% c('hg19', 'hg38') ) return(c('polyPhen', 'sift', 'exon', 'AApos', 'AAbefore', 'AAafter', 'domain', 'cosmic', 'isCosmicCensus', 'cosmicVariantMPM', 'cosmicGeneMPMPB'))
  if ( genome == 'mm10' ) return(c('polyPhen', 'sift', 'exon', 'AApos', 'AAbefore', 'AAafter', 'domain', 'isCCGD', 'CCGDstudies', 'CCGDcancerTypes', 'CCGDcosmic', 'CCGDcgc', 'CCGDranks'))
}  



#' Retrieves information about cosmic variant IDs
#'
#' @param cosmic character: The cosmic IDs.
#' @param cosmicDirectory character: The directory where the cosmic files CosmicMutantExportCensus.tsv and CosmicMutantExport.tsv are located.
#' @param onlyCensus logical: Restrict to cosmic census genes.
#'
#' @details For each cosmic ID, return whether it is seen in a census gene, and the mutations per million tumors (MPM) for that variant, as well as the average MPM over the gene (mutations per million tumors per basepair, MPMPB).
getCosmicCounts = function(cosmic, cosmicDirectory='/wehisan/general/academic/grp_leukemia_genomics/data/resources/COSMIC', onlyCensus=T, genome='hg19') {
  if ( cosmicDirectory == '' ) {
    warning('COSMIC directory not specified. To access more cosmic data, download CosmicMutantExportCensus.tsv and CosmicMutantExport.tsv from cosmic, put them in a directory (dont change the names of the files please), and provide the directory to postAnalyseVEP, getMoreVEPinfo or getCosmicCounts.')
    return(list(cosmic=cosmic, found=rep(F, length(cosmic)),
                variantDensity=rep(0, length(cosmic)), geneDensity=rep(0, length(cosmic))))
  }
  
  if ( onlyCensus ) countsFile = paste0(cosmicDirectory, '/cosmicCounts.Rdata')
  else countsFile = paste0(cosmicDirectory, '/allCosmicCounts_', genome, '.Rdata')
  if ( file.exists(countsFile) ) load(countsFile)
  else {
    if ( onlyCensus ) cosmicVariantsFile = paste0(cosmicDirectory, '/CosmicMutantExportCensus.tsv')
    else cosmicVariantsFile = paste0(cosmicDirectory, '/CosmicMutantExport.tsv')
    if ( !file.exists(cosmicVariantsFile) ) {
      warning(paste0('couldnt find cosmics file at ', cosmicVariantsFile,'. Will return 0 counts for all variants.'))
    return(list(cosmic=cosmic, found=rep(NA, length(cosmic)),
                variantDensity=rep(NA, length(cosmic)), geneDensity=rep(NA, length(cosmic))))
    }
    cosmicData = read.table(cosmicVariantsFile, fill=T, sep='\t', header=T)
    variantCounts = table(cosmicData$Mutation.ID)
    geneCounts = table(cosmicData$Gene.name)
    geneLengths = aggregate(Gene.CDS.length ~ Gene.name, cosmicData, FUN=max)
    variantGene = aggregate(Gene.name ~ Mutation.ID, cosmicData, FUN=function(genes) genes[1])
    rownames(variantGene) = variantGene[,1]
    nTumors = length(unique(cosmicData$ID_tumour))
    geneDensity = geneCounts/geneLengths[,2]/nTumors*1e6
    variantDensity = variantCounts/nTumors*1e6
    rates = table(cosmicData$Mutation.genome.position)/nTumors
    rates = rates[names(rates) != '']
    chr = gsub(':.*$', '', names(rates))
    pos = as.numeric(gsub('^.*:', '', gsub('-.*$', '', names(rates))))
    x = chrToX(chr, pos, genome=genome)
    df = data.frame(x=x, rates=as.numeric(rates))
    xRates = aggregate(rates ~ x, df, FUN=sum)

    cosmicCounts = list(geneCounts=geneCounts, geneLengths=geneLengths, nTumors=nTumors,
                        geneDensity=geneDensity, variantDensity=variantDensity, variantGene=variantGene, xRates=xRates)
    save(cosmicCounts, file=countsFile)
  }

  #count times the specific variant and gene is hit.
  found = cosmic %in% names(cosmicCounts$variantDensity)
  variantDensity = rep(0, length(cosmic))
  variantDensity[found] = cosmicCounts$variantDensity[cosmic[found]]
  geneDensity = rep(0, length(cosmic))
  geneDensity[found] = cosmicCounts$geneDensity[cosmicCounts$variantGene[cosmic[found],2]]

  return(list(cosmic=cosmic, found=found, variantDensity=variantDensity, geneDensity=geneDensity))
}


getCosmicCensusDensity = function(cosmicDirectory='/wehisan/general/academic/grp_leukemia_genomics/data/resources/COSMIC') {
  if ( cosmicDirectory == '' ) {
    warning('COSMIC directory not specified. To access more cosmic data, download CosmicMutantExportCensus.tsv and CosmicMutantExport.tsv from cosmic, put them in a directory (dont change the names of the files please), and provide the directory to postAnalyseVEP, getMoreVEPinfo or getCosmicCounts.')
    return(c())
  }
  
  countsFile = paste0(cosmicDirectory, '/cosmicCounts.Rdata')
  if ( file.exists(countsFile) ) load(countsFile)
  else {
    cosmicVariantsFile = paste0(cosmicDirectory, '/CosmicMutantExportCensus.tsv')
    if ( !file.exists(cosmicVariantsFile) ) {
      warning(paste0('couldnt find cosmics file at ', cosmicVariantsFile,'. Will return 0 counts for all variants.'))
      return(list(cosmic=cosmic, found=rep(NA, length(cosmic)),
                  variantDensity=rep(NA, length(cosmic)), geneDensity=NA))
    }
    cosmicData = read.table(cosmicVariantsFile, fill=T, sep='\t', header=T)
    variantCounts = table(cosmicData$Mutation.ID)
    geneCounts = table(cosmicData$Gene.name)
    geneLengths = aggregate(Gene.CDS.length ~ Gene.name, cosmicData, FUN=max)
    variantGene = aggregate(Gene.name ~ Mutation.ID, cosmicData, FUN=function(genes) genes[1])
    rownames(variantGene) = variantGene[,1]
    nTumors = length(unique(cosmicData$ID_tumour))
    geneDensity = geneCounts/geneLengths[,2]/nTumors*1e6
    variantDensity = variantCounts/nTumors*1e6

    cosmicCounts = list(geneCounts=geneCounts, geneLengths=geneLengths, nTumors=nTumors,
                        geneDensity=geneDensity, variantDensity=variantDensity, variantGene=variantGene)
    save(cosmicCounts, file=countsFile)
  }

  #count times the specific variant and gene is hit.
  censusGenes = names(cosmicCounts$geneDensity)
  censusGenes = gsub('\\_.*$', '', censusGenes)
  uniqueCensusGenes = unique(censusGenes)
  censusGeneDensity = sapply(uniqueCensusGenes, function(gene) sum(cosmicCounts$geneDensity[censusGenes == gene]))
    
  return(censusGeneDensity)
}
ChristofferFlensburg/superFreq documentation built on Nov. 15, 2023, 6:15 a.m.