#' unpackExacData
#'
#' Main entry point to unpack Exac vcf data. Unpacking is needed because multiallelics are hard to deal with,
#' so each alt allele is split into its separate row. Fields which are lists are converted to scalars.
#' FIXME! very slow, need to see if we can optimize
#' @param exacData Raw Exac data frame as returned from genomicRTools R package
#' @return Data frame with each row being a separate alt allele.
#' @export
#' @examples
#' exacData <- unpackExacData(exacData,'ENST0000323456')
unpackExacData <- function(exacData, transcriptID) {
init=F
# unpack multiallelic data and form nicely shaped data frame
for (i in 1:nrow(exacData) ) {
if (i %% 100 == 0) print(i)
# expand multiallelics
unpackedRow<-unpackVCFRow(exacData[i,],transcriptID)
if (!init) {
unpackedData=unpackedRow
init=T
} else unpackedData=rbind(unpackedData,unpackedRow)
}
return(unpackedData)
}
unpackVCFRow <- function(vcfRow, transcriptID) {
#print(vcfRow)
# unpack multiallelics into one row per alt allele
fieldsToUnpack <- c("AC","AC_AFR","AC_AMR","AC_Adj","AC_EAS","AC_FIN","AC_Hemi","AC_Het","AC_Hom",
"AC_NFE","AC_OTH","AC_SAS","AF",
"Hemi_AFR","Hemi_AMR","Hemi_EAS","Hemi_FIN","Hemi_NFE","Hemi_OTH","Hemi_SAS",
"Het_AFR","Het_AMR","Het_EAS","Het_FIN","Het_NFE","Het_OTH","Het_SAS",
"Hom_AFR","Hom_AMR","Hom_EAS","Hom_FIN","Hom_NFE","Hom_OTH","Hom_SAS",
"MLEAC","MLEAF")
df <- as.data.frame(vcfRow)
# build DF with all fields except the ones we need to unpack and convert from list to scalar
altIdx=which(names(vcfRow) %in% fieldsToUnpack)
newRows <- df[,-altIdx]
altAlleles<-as.character(vcfRow$ALT[[1]])
# special treatment for CSQ value: keep only transcript ID passed in, match alt alleles:
csq<- (Filter(function(x) length(grep(transcriptID,x))>0,df$CSQ[[1]])) # only keep annotations for given transcript ID
#print(csq)
csqList=lapply(csq, splitCSQAnnotation)
annotationAlleles=as.character(unlist(lapply(csqList,function(x) x$Allele)))
annotationAlleles[annotationAlleles=="-"]=""
allAlleles <- c(newRows$REF,altAlleles)
commonBases <- sapply(altAlleles,function(x,ref) commonSubstring(c(ref,x)), newRows$REF)
for (i in 1:length(altAlleles)) {
newDF<-newRows
# change alt,
newDF$ALT=altAlleles[[i]]
# for all columns which are lists, pick element i
for (fi in fieldsToUnpack) {
valList=df[,fi]
#print(valList)
newDF[,fi]=valList[[1]][i]
}
if (length(csqList) >0) {
if (length(altAlleles)== 1) {
csqEl=csqList
} else {
commonBase <- commonBases[[which(names(commonBases) == newDF$ALT)]]
#csqAllele <- paste0(commonBase,x$Allele)
# pick elements d in csqList such that csqAllele matches
csqEl<- (Filter(function(x) {
d <- if(x$Allele=="-") commonBase else paste0(commonBase,x$Allele)
d==newDF$ALT
},csqList))
}
if (length(csqEl)==0) {
newDF <- newDF[0,] # no matching annotation found
}
else {
newDF<-cbind(newDF,(csqEl[[1]])[,-1])
}
} else {
# workaround for empty csq list: happens if we don't have annotation for requested transcript id
newDF <- newDF[0,]
}
result<- if (i==1) newDF else rbind(result,newDF)
}
return(result)
}
#' splitCSQAnnotation
#'
#' Helper function to split CSQ annotation in Exac's INFO field
#' @param csqString Raw CSQ string as read from INFO field
#' @return Data frame with each column being a field on CSQ. If annotation has multiple values (e.g. due to multi-allelic), each allele is a separate row
#' @export
#' @examples
#' csqAnnot<- splitCSQAnnotation('C|ENSG00000223972|ENST00000456328|Transcript|non_coding_transcript_exon_variant&non_coding_transcript_variant|620||||||1||1|DDX11L1|HGNC|37102|processed_transcript|YES||||||||3/3|||ENST00000456328.2:n.620G>C|||||||||||||||||||,C|ENSG00000223972|ENST00000450305|Transcript|splice_region_variant&non_coding_transcript_exon_variant&non_coding_transcript_variant|412||||||1||1|DDX11L1|HGNC|37102|transcribed_unprocessed_pseudogene|||||||||5/6|||ENST00000450305.2:n.412G>C|||||||||||||||||||')
splitCSQAnnotation <- function(csqString) {
EXAC_VEP_FIELDS_31 <- 'Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|DISTANCE|STRAND|VARIANT_CLASS|MINIMISED|SYMBOL_SOURCE|HGNC_ID|CANONICAL|TSL|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|SIFT|PolyPhen|DOMAINS|HGVS_OFFSET|GMAF|AFR_MAF|AMR_MAF|ASN_MAF|EAS_MAF|EUR_MAF|SAS_MAF|AA_MAF|EA_MAF|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|LoF_info|LoF_flags|LoF_filter|LoF|context|ancestral'
csqFormat <- strsplit(EXAC_VEP_FIELDS_31,
'\\|')[[1]]
# split CSQ string into data frame with names columns
a=strsplit(csqString,"\\|")
b=as.data.frame(t(unlist(a)))
# FIXME? if last field was empty there is a column missing after split?
if (ncol(b) < length(csqFormat)) b[,length(csqFormat)]=""
names(b)=csqFormat
b
}
getHGVS <- function(csqStringList) {
unlist(lapply(csqStringList, function(x) {y=splitCSQAnnotation(x); y$HGVSc}))
}
getHGVSFromExacFile <- function(exacVCF, transcript, geneName) {
exacData <- loadExacData(exacVCF, transcript, geneName)
# simple: just get list of HGVS codes for all ExAC variants - will reannotate later
ann <- lapply(exacData$CSQ, getHGVS)
unlist(lapply(ann, function(x) {idx=grep(transcript$id,x); x[idx]}))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.