#' Run VEP on the provided variants
#'
#' @param variants variants: The variants to be VEPed.
#' @param plotDir character: The plot directory where outputSomatics have been run.
#' @param cpus integer: The number of cpus to be used as most.
#' @param forceRedoVEP Logical: if VEP should be rerun even if saved data already exists.
#'
#' @details This function calls VEP on the output from outputSomaticVariants. For this, VEP needs to be callable by system('vep').
runVEP = function(variants, plotDir, cpus=1, genome='hg19', vepCall='vep', forceRedoVEP=F, rareGermline=T) {
catLog('VEP-ing..')
dir = paste0(plotDir, '/somatics/')
for ( name in names(variants$variants) ) {
infile = paste0(dir, '/', name, '.txt')
VEPfile = paste0(dir, '/', name, '.VEP.txt')
if ( !file.exists(VEPfile) | forceRedoVEP ) {
wd = getwd()
catLog('Moving to ', dir, '\n')
setwd(dir)
q = variants$variants[[name]]
nVariants = sum(q$somaticP > 0 & !is.na(q$somaticP))
if ( !rareGermline )
nVariants = sum(q$somaticP > 0 & (!q$germline | is.na(q$germline)) & !is.na(q$somaticP))
catLog(name, ': ', nVariants, ' variants.\n', sep='')
if ( nVariants == 0 ) {
catLog('Moving back to ', wd, '\n')
setwd(wd)
next
}
assembly = genomeToAssembly(genome)
#seems like veps format of the version output in the --help changed, so skipping version check and assuming 89...
if ( FALSE ) {
catLog('Checking VEP version.\n')
catLog(paste0(vepCall, ' --help'), '\n')
vepHelp = system(paste0(vepCall, ' --help'), intern=T)
versionLine = strsplit(vepHelp[grep('^version', vepHelp)], ' ')[[1]]
vepVersion = as.numeric(versionLine[length(versionLine)])
}
vepVersion = 89
if ( vepVersion < 76 & genome == 'hg38' ) {
warning('VEP seems to be pre version 76, which isnt supported for hg38. Please update VEP. Running without annotation for now.')
return(variants)
}
if ( vepVersion < 76 ) {
catLog('Using pre-76 VEP version call. Not supported, but trying...\n')
if ( cpus == 1 )
call = paste0(vepCall, ' -i ', basename(infile), ' -o ', basename(VEPfile), ' --cache --everything --force_overwrite')
else
call = paste0(vepCall, ' -i ', basename(infile), ' -o ', basename(VEPfile), ' --cache --everything --force_overwrite --fork ', cpus)
}
else {
if ( cpus == 1 )
call = paste0(vepCall, ' -i ', basename(infile), ' -o ', basename(VEPfile), ' --cache --everything --force_overwrite --assembly ', assembly)
else
call = paste0(vepCall, ' -i ', basename(infile), ' -o ', basename(VEPfile), ' --cache --everything --force_overwrite --assembly ', assembly, ' --fork ', cpus)
}
if ( genome == 'mm10' ) call = paste0(vepCall, ' -i ', basename(infile), ' -o ', basename(VEPfile), ' --cache --everything --force_overwrite --fork ', cpus, ' --species mus_musculus')
catLog(call, '\n')
call = paste0(call, '\n')
systemRet = system(call, intern=T)
#not sure if the current VEP version use the word "Finished", so removing this warning.
#if ( !any(grepl('Finished', systemRet)) ) warning('VEP run didnt finish cleanly!')
catLog('Moving back to ', wd, '\n')
setwd(wd)
}
}
catLog('done.\n')
catLog('Importing VEP results:\n')
for ( name in names(variants$variants) ) {
catLog(name, ': ', sep='')
if ( sum(variants$variants[[name]]$somaticP > 0 & !is.na(variants$variants[[name]]$somaticP)) == 0 ) {
catLog('no somatic variants.\n')
variants$variants[[name]]$type = rep('notChecked', nrow(variants$variants[[name]]))
variants$variants[[name]]$severity = rep(100, nrow(variants$variants[[name]]))
next
}
VEPfile = paste0(dir, '/', name, '.VEP.txt')
VEPdata = try(read.table(VEPfile, fill=T, header=F, colClasses='character'), silent=T)
if ( inherits(VEPdata, 'try-error') ) {
catLog('failed to read VEP file ', VEPfile,'.\n')
variants$variants[[name]]$type = rep('notChecked', nrow(variants$variants[[name]]))
variants$variants[[name]]$severity = rep(100, nrow(variants$variants[[name]]))
}
else {
ID = as.numeric(as.factor(VEPdata$V1))
chr = sapply(strsplit(as.character(VEPdata$V2), ':'), function(v) v[1])
pos = sapply(strsplit(as.character(VEPdata$V2), ':'), function(v) v[2])
end = as.numeric(gsub('^.*-', '',pos))
pos = as.numeric(gsub('-.*$', '',pos))
var = as.character(VEPdata$V3)
deletions = grepl('-', var)
if ( any(deletions) ) pos[deletions] = pos[deletions] - 1
type = as.character(VEPdata$V7)
sev = sapply(type, typeToSeverity)
x = chrToX(chr, pos, genome=genome)
lastCol = strsplit(as.character(VEPdata$V14), ';')
polyPhen = sapply(lastCol, function(strs) {
if ( any(grepl('PolyPhen=', strs)) ) return(as.numeric(gsub('\\)$', '', gsub('^.*\\(', '', strs[grep('PolyPhen=',strs)[1]]))))
else return(0.05)
})
namevar = var
namevar[nchar(namevar) > 1] = paste0('+', substring(var[nchar(var) > 1], 2))
namevar[namevar == '-'] = paste0('-', pmax(1, as.numeric(end)[namevar == '-']-as.numeric(pos)[namevar == '-']))
rowNames = paste0(x, namevar)
qNames = rep('', length(unique(ID)))
qNames[ID] = rowNames
#find most severe effect
mostSev = rep('unkown', length(unique(ID)))
sevScore = rep(100, length(unique(ID)))
for ( i in 1:nrow(VEPdata) ) {
sevI = sev[i]
IDI = ID[i]
if ( sevI - polyPhen[i] < sevScore[IDI] ) {
sevScore[IDI] = sevI - polyPhen[i]
mostSev[IDI] = severityToType(sevI)
}
}
#add effect and severity to q
variants$variants[[name]]$type = rep('notChecked', nrow(variants$variants[[name]]))
variants$variants[[name]]$severity = rep(100, nrow(variants$variants[[name]]))
variants$variants[[name]][qNames,]$type = mostSev
variants$variants[[name]][qNames,]$severity = sevScore
catLog(length(mostSev), 'VEPed variants.\n')
}
}
catLog('done.\n')
return(variants)
}
genomeToAssembly = function(genome) {
if ( genome == 'hg19' ) return('GRCh37')
if ( genome == 'hg38' ) return('GRCh38')
return('')
}
#' Ranks the variant effects
#'
#' @param type character: The effect.
#'
#' @details Returns a low score for severe effects, higher for less severe, 100 for unknown. 11 and below alters the protein.
typeToSeverity = function(type, annotationMethod='VEP') {
if ( annotationMethod == 'VariantAnnotation' ) return(VAconsequenceToSeverityRank(type))
#first change some (older? newer?) notation into the ensemble SO terms
type = gsub('non_coding_exon_variant', 'non_coding_transcript_exon_variant', type)
type = gsub('nc_transcript_variant', 'non_coding_transcript_exon_variant', type)
if ( grepl('transcript_ablation', type) ) return(1)
if ( grepl('splice_acceptor_variant', type) ) return(2)
if ( grepl('splice_donor_variant', type) ) return(3)
if ( grepl('stop_gained', type) ) return(4)
if ( grepl('frameshift_variant', type) ) return(5)
if ( grepl('stop_lost', type) ) return(6)
if ( grepl('start_lost', type) ) return(7)
if ( grepl('initiator_codon_variant', type) ) return(7)
if ( grepl('transcript_amplification', type) ) return(8)
if ( grepl('inframe_insertion', type) ) return(9)
if ( grepl('inframe_deletion', type) ) return(10)
if ( grepl('missense_variant', type) ) return(11)
if ( grepl('protein_altering_variant', type) ) return(12)
if ( grepl('splice_region_variant', type) ) return(13)
if ( grepl('incomplete_terminal_codon_variant', type) ) return(14)
if ( grepl('stop_retained_variant', type) ) return(15)
if ( grepl('synonymous_variant', type) ) return(16)
if ( grepl('coding_sequence_variant', type) ) return(17)
if ( grepl('mature_miRNA_variant', type) ) return(18)
if ( grepl('5_prime_UTR_variant', type) ) return(19)
if ( grepl('3_prime_UTR_variant', type) ) return(20)
if ( grepl('non_coding_transcript_exon_variant', type) ) return(21)
if ( grepl('intron_variant', type) ) return(22)
if ( grepl('NMD_transcript_variant', type) ) return(23)
if ( grepl('non_coding_transcript_variant', type) ) return(24)
if ( grepl('upstream_gene_variant', type) ) return(25)
if ( grepl('downstream_gene_variant', type) ) return(26)
if ( grepl('TFBS_ablation', type) ) return(27)
if ( grepl('TFBS_amplification', type) ) return(28)
if ( grepl('TF_binding_site_variant', type) ) return(29)
if ( grepl('regulatory_region_ablation', type) ) return(30)
if ( grepl('regulatory_region_amplification', type) ) return(31)
if ( grepl('regulatory_region_variant', type) ) return(32)
if ( grepl('feature_elongation', type) ) return(33)
if ( grepl('feature_truncation', type) ) return(34)
if ( grepl('intergenic_variant', type) ) return(35)
if ( type == '-' ) return(100)
if ( grepl('unknown', type) ) return(100)
catLog('Dont know about the mutation type: ', type, '\n')
return(100)
}
#' The variant effect as function of severity
#'
#' @param severity integer: The severity rank.
#'
#' @details Translated back from severity rank to the variant effect.
severityToType = function(severity, annotationMethod='VEP') {
if ( annotationMethod == 'VariantAnnotation' ) return(VAseverityToConsequence(severity))
if ( severity == 1 ) return('transcript_ablation')
if ( severity == 2 ) return('splice_acceptor_variant')
if ( severity == 3 ) return('splice_donor_variant')
if ( severity == 4 ) return('stop_gained')
if ( severity == 5 ) return('frameshift_variant')
if ( severity == 6 ) return('stop_lost')
if ( severity == 7 ) return('start_lost')
if ( severity == 8 ) return('transcript_amplification')
if ( severity == 9 ) return('inframe_insertion')
if ( severity == 10 ) return('inframe_deletion')
if ( severity == 11 ) return('missense_variant')
if ( severity == 12 ) return('protein_altering_variant')
if ( severity == 13 ) return('splice_region_variant')
if ( severity == 14 ) return('incomplete_terminal_codon_variant')
if ( severity == 15 ) return('stop_retained_variant')
if ( severity == 16 ) return('synonymous_variant')
if ( severity == 17 ) return('coding_sequence_variant')
if ( severity == 18 ) return('mature_miRNA_variant')
if ( severity == 19 ) return('5_prime_UTR_variant')
if ( severity == 20 ) return('3_prime_UTR_variant')
if ( severity == 21 ) return('non_coding_transcript_exon_variant')
if ( severity == 22 ) return('intron_variant')
if ( severity == 23 ) return('NMD_transcript_variant')
if ( severity == 24 ) return('non_coding_transcript_variant')
if ( severity == 25 ) return('upstream_gene_variant')
if ( severity == 26 ) return('downstream_gene_variant')
if ( severity == 27 ) return('TFBS_ablation')
if ( severity == 28 ) return('TFBS_amplification')
if ( severity == 29 ) return('TF_binding_site_variant')
if ( severity == 30 ) return('regulatory_region_ablation')
if ( severity == 31 ) return('regulatory_region_amplification')
if ( severity == 32 ) return('regulatory_region_variant')
if ( severity == 33 ) return('feature_elongation')
if ( severity == 34 ) return('feature_truncation')
if ( severity == 35 ) return('intergenic_variant')
if ( severity == 100 ) return('unknown')
return('unknown')
}
#' Runs VEP on a R and plot directory that superFreq has been run on.
#'
#' @param outputDirectories named list with Rdirectory and plotDirectory.
#' @param inputFiles named list as in analyse.
#' @param genome character: such as 'hg19', 'hg38' or 'mm10'
#' @param cpus integer. maximum number of parallel threads.
#' @param cosmicDirectory character: path to COSMIC directory. (optional)
#' @param forceRedo boolean: if TRUE, previously saved data is ignored and overwritten.
#'
#' @export
#' @details Runs VEP on the variants, and saves the variants (retrivable with loadData) with the addition
#' information from VEP, and COSMIC if provided.
postAnalyseVEP = function(outputDirectories, inputFiles=NA, metaData=NA, genome='hg19', cpus=1, cosmicDirectory='', vepCall='vep', forceRedo=F) {
Rdirectory = outputDirectories$Rdirectory
plotDirectory = outputDirectories$plotDirectory
data = loadData(Rdirectory)
if ( !('allVariants' %in% names(data)) ) {
warning('Cant find a saved allVariants.\n')
return()
}
if ( genome %in% c('hg19', 'hg38') & 'cosmic' %in% names(data$allVariants$variants$variants[[1]]) & !forceRedo ) return()
if ( genome == 'mm10' & 'isCCGD' %in% names(data$allVariants$variants$variants[[1]]) & !forceRedo ) return()
backup = paste0(Rdirectory, '/allVariantsPreVEP.Rdata')
if ( !file.exists(backup) ) {
catLog('Backing up variants (in case the vep run goes wrong) to', backup, '\n')
allVariantsPreVEP = data$allVariants
if ( !is.null(allVariantsPreVEP) )
save('allVariantsPreVEP', file=backup)
}
variants = data$allVariants$variants
if ( !('allVariants' %in% names(data)) & 'variants' %in% names(data) )
variants = data$variants
if ( inherits(metaData, 'data.frame') )
sampleMetaData = metaData
else if ( inherits(inputFiles, 'list') )
sampleMetaData = importSampleMetaData(inputFiles$metaDataFile)
else
stop('Neither inputFiles nor metaData was defined.')
normals = as.logical(gsub('YES', 'T', gsub('NO', 'F', sampleMetaData$NORMAL)))
names = make.names(sampleMetaData$NAME, unique=T)
names(normals) = names
individuals = sampleMetaData$INDIVIDUAL
timePoints = sampleMetaData$TIMEPOINT
names(timePoints) = names(individuals) = names
samplePairs = metaToSamplePairs(names, individuals, normals)
timeSeries = metaToTimeSeries(names, individuals, normals)
outputSomaticVariants(variants, genome=genome, plotDirectory=plotDirectory, cpus=cpus, forceRedo=forceRedo)
variants = runVEP(variants, plotDirectory, cpus=cpus, genome=genome, vepCall=vepCall, forceRedoVEP=forceRedo)
variants = getMoreVEPinfo(variants, plotDirectory, genome=genome, cosmicDirectory=cosmicDirectory)
allVariants = data$allVariants
allVariants$variants = variants
allVariantSaveFile = paste0(Rdirectory, '/allVariants.Rdata')
catLog('Saving fully annotated variants to', allVariantSaveFile, '...')
save('allVariants', file=allVariantSaveFile)
catLog('done.\n')
storiesSaveFile = paste0(Rdirectory, '/stories.Rdata')
catLog('Replacing fully annotated variants in', storiesSaveFile, '...')
load(storiesSaveFile)
stories$variants = variants
save('stories', file=storiesSaveFile)
catLog('done.\n')
outputSomaticVariants(variants, genome, plotDirectory, cpus=cpus, forceRedo=T)
makeSNPprogressionPlots(variants, timeSeries=timeSeries, normals = normals, plotDirectory=plotDirectory,
genome=genome, forceRedo=T, maxRowCluster=2000)
makeRiverPlots(data$stories$stories, variants, genome=genome, cpus=cpus, plotDirectory=plotDirectory, forceRedo=T)
makeScatterPlots(variants, samplePairs, timePoints, plotDirectory, genome=genome, cpus=cpus, forceRedo=T)
makeCloneScatterPlots(variants, data$stories$stories, samplePairs, individuals, timePoints,
plotDirectory, genome=genome, cpus=cpus, forceRedo=T)
outputNewVariants(variants, samplePairs, genome, plotDirectory, cpus=cpus, forceRedo=T)
}
#' Import more information about the variants
#'
#' @param variants variants: The variants
#' @param plotDirectory character: The plot directory from the analyse call.
#'
#' @details Extract almost all the information from the VEP run, and cross-checks with COSMIC data as well.
#' getCosmicCounts is called by this function, see details of that for requirements.
getMoreVEPinfo = function(variants, plotDirectory, genome='hg19', cosmicDirectory='') {
dir = paste0(plotDirectory, '/somatics')
catLog('Importing more VEP info:\n')
for ( name in names(variants$variants) ) {
catLog(name, '.. ')
if ( sum(variants$variants[[name]]$somaticP > 0 & !is.na(variants$variants[[name]]$somaticP)) == 0 ) {
catLog('no somatic variants.\n')
variants$variants[[name]] = addNullAnnotation(variants$variants[[name]], genome=genome)
next
}
else {
VEPfile = paste0(dir, '/', name, '.VEP.txt')
VEPdata = try(read.table(VEPfile, fill=T, header=F, colClasses='character'), silent=T)
if ( inherits(VEPdata, 'try-error') )
catLog('failed to read VEP file ', VEPfile,'.\n')
ID = as.numeric(as.factor(VEPdata$V1))
var = as.character(VEPdata$V3)
chr = sapply(strsplit(as.character(VEPdata$V2), ':'), function(v) v[1])
pos = sapply(strsplit(as.character(VEPdata$V2), ':'), function(v) v[2])
end = as.numeric(gsub('^.*-', '',pos))
pos = as.numeric(gsub('-.*$', '',pos))
deletions = grepl('-', var)
if ( any(deletions) ) pos[deletions] = pos[deletions] - 1
type = as.character(VEPdata$V7)
sev = sapply(type, typeToSeverity)
x = chrToX(chr, pos, genome)
AApos = VEPdata$V10
AAbefore = gsub('\\/.*$', '', VEPdata$V11)
AAafter = gsub('^.*\\/', '', VEPdata$V11)
secondLastCol = strsplit(as.character(VEPdata$V13), ',')
lastCol = strsplit(as.character(VEPdata$V14), ';')
polyPhen = sapply(lastCol, function(strs) {
if ( any(grepl('PolyPhen=', strs)) ) return(gsub('^PolyPhen=', '', strs[grep('PolyPhen=',strs)[1]]))
else return('')
})
numericPolyPhen = sapply(lastCol, function(strs) {
if ( any(grepl('PolyPhen=', strs)) )
return(as.numeric(gsub('\\)$', '', gsub('^.*\\(', '', strs[grep('PolyPhen=',strs)[1]]))))
else return(0.0)
})
symbol = sapply(lastCol, function(strs) {
if ( any(grepl('SYMBOL=', strs)) ) return(gsub('^SYMBOL=', '', strs[grep('SYMBOL=',strs)[1]]))
else return('')
})
exon = sapply(lastCol, function(strs) {
if ( any(grepl('EXON=', strs)) ) return(gsub('^EXON=', '', strs[grep('EXON=',strs)[1]]))
else return('')
})
sift = sapply(lastCol, function(strs) {
if ( any(grepl('SIFT=', strs)) ) return(gsub('^SIFT=', '', strs[grep('SIFT=',strs)[1]]))
else return('')
})
domain = sapply(lastCol, function(strs) {
if ( any(grepl('DOMAINS=', strs)) ) return(gsub('\\,.*$', '', gsub('^DOMAINS=', '', strs[grep('DOMAINS=',strs)[1]])))
else return('')
})
if ( genome %in% c('hg19', 'hg38') ) {
cosmic = sapply(secondLastCol, function(strs) {
if ( any(grepl('COSM', strs)) ) return(strs[grep('COSM',strs)[1]])
else return('')
})
cosmicCounts = getCosmicCounts(cosmic, cosmicDirectory=cosmicDirectory, genome=genome)
isCosmicCensus = cosmicCounts$found
cosmicCounts = getCosmicCounts(cosmic, cosmicDirectory=cosmicDirectory, onlyCensus=F, genome=genome)
cosmicVariantDensity = cosmicCounts$variantDensity
cosmicGeneDensity = cosmicCounts$geneDensity
censusDensity = getCosmicCensusDensity(cosmicDirectory=cosmicDirectory)
noHitCensus = !isCosmicCensus & symbol %in% names(censusDensity) & sev <= 11
if ( any(noHitCensus) ) {
isCosmicCensus[noHitCensus] = T
cosmicGeneDensity[noHitCensus] = censusDensity[symbol[noHitCensus]]
}
}
if ( genome == 'mm10' ) {
catLog('adding CCGD data..')
CCGDdata = read.table(paste0(cosmicDirectory, '/CCGD_export.csv'), sep=',', header=T, stringsAsFactors=F)
censusGenes = unique(CCGDdata$Mouse.Symbol)
isCCGD = symbol %in% censusGenes
CCGDsummary = sapply(censusGenes, function(gene) {
subData = CCGDdata[CCGDdata$Mouse.Symbol == gene,]
studies = subData$Studies[1]
humanGene = subData$Human.Symbol[1]
cancerTypes = sort(table(subData$Cancer.Type), decreasing=T)
names(cancerTypes) = gsub(' Cancer', '',names(cancerTypes))
cancerTypes = gsub(';$','',do.call(paste0, as.list(paste0(names(cancerTypes), ":", cancerTypes, ';'))))
isInCosmic = subData$COSMIC[1] == 'Yes'
isInCGC = subData$CGC[1] == 'Yes'
ranks = table(c('A', 'B', 'C', 'D', subData$Relative.Rank))
ranks[c('A', 'B', 'C', 'D')] = ranks[c('A', 'B', 'C', 'D')] - 1
names(ranks) = gsub(' Cancer', '',names(ranks))
ranks = gsub(';$','',do.call(paste0, as.list(paste0(names(ranks), ":", ranks, ';'))))
return(c('studies'=studies, 'cancerTypes'=cancerTypes, 'cosmic'=isInCosmic,
'cgc'=isInCGC, 'ranks'=ranks, 'humanGene'=humanGene))
})
CCGDsummary = as.data.frame(t(CCGDsummary), stringsAsFactors=F)
rownames(CCGDsummary) = censusGenes
CCGDsummary$studies = as.numeric(CCGDsummary$studies)
CCGDsummary$cosmic = as.logical(CCGDsummary$cosmic)
CCGDsummary$cgc = as.logical(CCGDsummary$cgc)
CCGDstudies = rep(0, length(symbol))
CCGDcancerTypes = rep('', length(symbol))
CCGDcosmic = rep('', length(symbol))
CCGDcgc = rep('', length(symbol))
CCGDranks = rep('', length(symbol))
CCGDstudies[isCCGD] = CCGDsummary[symbol[isCCGD],]$studies
CCGDcancerTypes[isCCGD] = CCGDsummary[symbol[isCCGD],]$cancerTypes
CCGDcosmic[isCCGD] = ifelse(CCGDsummary[symbol[isCCGD],]$cosmic, 'YES!', 'no')
CCGDcgc[isCCGD] = ifelse(CCGDsummary[symbol[isCCGD],]$cgc, 'YES!', 'no')
CCGDranks[isCCGD] = CCGDsummary[symbol[isCCGD],]$ranks
catLog('found ', sum(isCCGD), ' entries.\n', sep='')
}
namevar = var
namevar[nchar(namevar) > 1] = paste0('+', substring(var[nchar(var) > 1], 2))
namevar[namevar == '-'] = paste0('-', pmax(1, as.numeric(end)[namevar == '-']-as.numeric(pos)[namevar == '-']))
rowNames = paste0(x, namevar)
qNames = rep('', length(unique(ID)))
qNames[ID] = rowNames
#find most severe effect
mostSev = rep('unkown', length(unique(ID)))
sevScore = rep(100, length(unique(ID)))
polyPhenRet = rep('', length(unique(ID)))
exonRet = rep('', length(unique(ID)))
siftRet = rep('', length(unique(ID)))
symbolRet = rep('', length(unique(ID)))
AAposRet = rep('', length(unique(ID)))
AAbeforeRet = rep('', length(unique(ID)))
AAafterRet = rep('', length(unique(ID)))
domainRet = rep('', length(unique(ID)))
cosmicRet = rep('', length(unique(ID)))
isCosmicCensusRet = rep(F, length(unique(ID)))
cosmicVariantDensityRet = rep(0, length(unique(ID)))
cosmicGeneDensityRet = rep(0, length(unique(ID)))
CCGDstudiesRet = rep(0, length(unique(ID)))
CCGDcancerTypesRet = rep('', length(unique(ID)))
CCGDcosmicRet = rep(F, length(unique(ID)))
CCGDcgcRet = rep(F, length(unique(ID)))
CCGDranksRet = rep('', length(unique(ID)))
for ( i in 1:nrow(VEPdata) ) {
sevI = sev[i]
IDI = ID[i]
if ( sevI - numericPolyPhen[i] < sevScore[IDI] ) {
sevScore[IDI] = sevI - numericPolyPhen[i]
mostSev[IDI] = severityToType(sevI)
polyPhenRet[IDI] = polyPhen[i]
exonRet[IDI] = exon[i]
siftRet[IDI] = sift[i]
symbolRet[IDI] = symbol[i]
AAposRet[IDI] = AApos[i]
AAbeforeRet[IDI] = AAbefore[i]
AAafterRet[IDI] = AAafter[i]
domainRet[IDI] = domain[i]
if ( genome %in% c('hg19', 'hg38') ) {
cosmicRet[IDI] = cosmic[i]
isCosmicCensusRet[IDI] = isCosmicCensus[i]
cosmicVariantDensityRet[IDI] = cosmicVariantDensity[i]
cosmicGeneDensityRet[IDI] = cosmicGeneDensity[i]
}
if ( genome == 'mm10' ) {
CCGDstudiesRet[IDI] = CCGDstudies[i]
CCGDcancerTypesRet[IDI] = CCGDcancerTypes[i]
CCGDcosmicRet[IDI] = CCGDcosmic[i]
CCGDranksRet[IDI] = CCGDcgc[i]
CCGDranksRet[IDI] = CCGDranks[i]
}
}
}
#add effect and severity to q
variants$variants[[name]] = addNullAnnotation(variants$variants[[name]], genome=genome)
variants$variants[[name]][qNames,]$severity = sevScore
variants$variants[[name]][qNames,]$type = mostSev
variants$variants[[name]][qNames,]$polyPhen = polyPhenRet
variants$variants[[name]][qNames,]$sift = siftRet
for ( sampleName in names(variants$variants) ) {
present = qNames %in% rownames(variants$variants[[sampleName]])
variants$variants[[sampleName]][qNames[present],]$inGene[symbolRet[present] != ''] = symbolRet[present][symbolRet[present] != '']
}
variants$variants[[name]][qNames,]$exon = exonRet
variants$variants[[name]][qNames,]$AApos = AAposRet
variants$variants[[name]][qNames,]$AAbefore = AAbeforeRet
variants$variants[[name]][qNames,]$AAafter = AAafterRet
variants$variants[[name]][qNames,]$domain = domainRet
if ( genome %in% c('hg19', 'hg38') ) {
variants$variants[[name]][qNames,]$cosmic = cosmicRet
variants$variants[[name]][qNames,]$isCosmicCensus = isCosmicCensusRet
variants$variants[[name]][qNames,]$cosmicVariantMPM = cosmicVariantDensityRet
variants$variants[[name]][qNames,]$cosmicGeneMPMPB = cosmicGeneDensityRet
}
if ( genome == c('mm10') ) {
variants$variants[[name]][qNames,]$CCGDstudies = CCGDstudiesRet
variants$variants[[name]][qNames,]$CCGDcancerTypes = CCGDcancerTypesRet
variants$variants[[name]][qNames,]$CCGDcosmic = CCGDcosmicRet
variants$variants[[name]][qNames,]$CCGDcgc = CCGDcgcRet
variants$variants[[name]][qNames,]$CCGDranks = CCGDranksRet
}
catLog(length(mostSev), 'VEPed variants.\n')
}
}
catLog('done.\n')
return(variants)
}
addNullAnnotation = function(q, genome='hg19') {
q$severity = rep(100, nrow(q))
q$type = rep('unknown', nrow(q))
q$polyPhen = rep('', nrow(q))
q$sift = rep('', nrow(q))
q$exon = rep('', nrow(q))
q$AApos = rep('', nrow(q))
q$AAbefore = rep('', nrow(q))
q$AAafter = rep('', nrow(q))
q$domain = rep('', nrow(q))
if ( genome %in% c('hg19', 'hg38') ) {
q$cosmic = rep('', nrow(q))
q$isCosmicCensus = rep(F, nrow(q))
q$cosmicVariantMPM = rep(0, nrow(q))
q$cosmicGeneMPMPB = rep(0, nrow(q))
}
if ( genome == 'mm10' ) {
q$isCCGD = rep(F, nrow(q))
q$CCGDstudies = rep(0, nrow(q))
q$CCGDcancerTypes = rep('', nrow(q))
q$CCGDcosmic = rep(F, nrow(q))
q$CCGDcgc = rep(F, nrow(q))
q$CCGDranks = rep('', nrow(q))
}
return(q)
}
#' internal function
#'
#' @details Just remembers which columns are added by getMoreVEPinfo
moreVEPnames = function(genome='hg19') {
if ( genome %in% c('hg19', 'hg38') ) return(c('polyPhen', 'sift', 'exon', 'AApos', 'AAbefore', 'AAafter', 'domain', 'cosmic', 'isCosmicCensus', 'cosmicVariantMPM', 'cosmicGeneMPMPB'))
if ( genome == 'mm10' ) return(c('polyPhen', 'sift', 'exon', 'AApos', 'AAbefore', 'AAafter', 'domain', 'isCCGD', 'CCGDstudies', 'CCGDcancerTypes', 'CCGDcosmic', 'CCGDcgc', 'CCGDranks'))
}
#' Retrieves information about cosmic variant IDs
#'
#' @param cosmic character: The cosmic IDs.
#' @param cosmicDirectory character: The directory where the cosmic files CosmicMutantExportCensus.tsv and CosmicMutantExport.tsv are located.
#' @param onlyCensus logical: Restrict to cosmic census genes.
#'
#' @details For each cosmic ID, return whether it is seen in a census gene, and the mutations per million tumors (MPM) for that variant, as well as the average MPM over the gene (mutations per million tumors per basepair, MPMPB).
getCosmicCounts = function(cosmic, cosmicDirectory='/wehisan/general/academic/grp_leukemia_genomics/data/resources/COSMIC', onlyCensus=T, genome='hg19') {
if ( cosmicDirectory == '' ) {
warning('COSMIC directory not specified. To access more cosmic data, download CosmicMutantExportCensus.tsv and CosmicMutantExport.tsv from cosmic, put them in a directory (dont change the names of the files please), and provide the directory to postAnalyseVEP, getMoreVEPinfo or getCosmicCounts.')
return(list(cosmic=cosmic, found=rep(F, length(cosmic)),
variantDensity=rep(0, length(cosmic)), geneDensity=rep(0, length(cosmic))))
}
if ( onlyCensus ) countsFile = paste0(cosmicDirectory, '/cosmicCounts.Rdata')
else countsFile = paste0(cosmicDirectory, '/allCosmicCounts_', genome, '.Rdata')
if ( file.exists(countsFile) ) load(countsFile)
else {
if ( onlyCensus ) cosmicVariantsFile = paste0(cosmicDirectory, '/CosmicMutantExportCensus.tsv')
else cosmicVariantsFile = paste0(cosmicDirectory, '/CosmicMutantExport.tsv')
if ( !file.exists(cosmicVariantsFile) ) {
warning(paste0('couldnt find cosmics file at ', cosmicVariantsFile,'. Will return 0 counts for all variants.'))
return(list(cosmic=cosmic, found=rep(NA, length(cosmic)),
variantDensity=rep(NA, length(cosmic)), geneDensity=rep(NA, length(cosmic))))
}
cosmicData = read.table(cosmicVariantsFile, fill=T, sep='\t', header=T)
variantCounts = table(cosmicData$Mutation.ID)
geneCounts = table(cosmicData$Gene.name)
geneLengths = aggregate(Gene.CDS.length ~ Gene.name, cosmicData, FUN=max)
variantGene = aggregate(Gene.name ~ Mutation.ID, cosmicData, FUN=function(genes) genes[1])
rownames(variantGene) = variantGene[,1]
nTumors = length(unique(cosmicData$ID_tumour))
geneDensity = geneCounts/geneLengths[,2]/nTumors*1e6
variantDensity = variantCounts/nTumors*1e6
rates = table(cosmicData$Mutation.genome.position)/nTumors
rates = rates[names(rates) != '']
chr = gsub(':.*$', '', names(rates))
pos = as.numeric(gsub('^.*:', '', gsub('-.*$', '', names(rates))))
x = chrToX(chr, pos, genome=genome)
df = data.frame(x=x, rates=as.numeric(rates))
xRates = aggregate(rates ~ x, df, FUN=sum)
cosmicCounts = list(geneCounts=geneCounts, geneLengths=geneLengths, nTumors=nTumors,
geneDensity=geneDensity, variantDensity=variantDensity, variantGene=variantGene, xRates=xRates)
save(cosmicCounts, file=countsFile)
}
#count times the specific variant and gene is hit.
found = cosmic %in% names(cosmicCounts$variantDensity)
variantDensity = rep(0, length(cosmic))
variantDensity[found] = cosmicCounts$variantDensity[cosmic[found]]
geneDensity = rep(0, length(cosmic))
geneDensity[found] = cosmicCounts$geneDensity[cosmicCounts$variantGene[cosmic[found],2]]
return(list(cosmic=cosmic, found=found, variantDensity=variantDensity, geneDensity=geneDensity))
}
getCosmicCensusDensity = function(cosmicDirectory='/wehisan/general/academic/grp_leukemia_genomics/data/resources/COSMIC') {
if ( cosmicDirectory == '' ) {
warning('COSMIC directory not specified. To access more cosmic data, download CosmicMutantExportCensus.tsv and CosmicMutantExport.tsv from cosmic, put them in a directory (dont change the names of the files please), and provide the directory to postAnalyseVEP, getMoreVEPinfo or getCosmicCounts.')
return(c())
}
countsFile = paste0(cosmicDirectory, '/cosmicCounts.Rdata')
if ( file.exists(countsFile) ) load(countsFile)
else {
cosmicVariantsFile = paste0(cosmicDirectory, '/CosmicMutantExportCensus.tsv')
if ( !file.exists(cosmicVariantsFile) ) {
warning(paste0('couldnt find cosmics file at ', cosmicVariantsFile,'. Will return 0 counts for all variants.'))
return(list(cosmic=cosmic, found=rep(NA, length(cosmic)),
variantDensity=rep(NA, length(cosmic)), geneDensity=NA))
}
cosmicData = read.table(cosmicVariantsFile, fill=T, sep='\t', header=T)
variantCounts = table(cosmicData$Mutation.ID)
geneCounts = table(cosmicData$Gene.name)
geneLengths = aggregate(Gene.CDS.length ~ Gene.name, cosmicData, FUN=max)
variantGene = aggregate(Gene.name ~ Mutation.ID, cosmicData, FUN=function(genes) genes[1])
rownames(variantGene) = variantGene[,1]
nTumors = length(unique(cosmicData$ID_tumour))
geneDensity = geneCounts/geneLengths[,2]/nTumors*1e6
variantDensity = variantCounts/nTumors*1e6
cosmicCounts = list(geneCounts=geneCounts, geneLengths=geneLengths, nTumors=nTumors,
geneDensity=geneDensity, variantDensity=variantDensity, variantGene=variantGene)
save(cosmicCounts, file=countsFile)
}
#count times the specific variant and gene is hit.
censusGenes = names(cosmicCounts$geneDensity)
censusGenes = gsub('\\_.*$', '', censusGenes)
uniqueCensusGenes = unique(censusGenes)
censusGeneDensity = sapply(uniqueCensusGenes, function(gene) sum(cosmicCounts$geneDensity[censusGenes == gene]))
return(censusGeneDensity)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.