genomicRTools: Genomic R Tools

#' unpackExacData
#'
#' Main entry point to unpack Exac vcf data. Unpacking is needed because multiallelics are hard to deal with,
#' so each alt allele is split into its separate row. Fields which are lists are converted to scalars.
#' FIXME! very slow, need to see if we can optimize
#' @param exacData Raw Exac data frame as returned from genomicRTools R package
#' @return Data frame with each row being a separate alt allele.
#' @export
#' @examples
#' exacData <- unpackExacData(exacData,'ENST0000323456')
unpackExacData <- function(exacData, transcriptID) {
  init=F
  # unpack multiallelic data and form nicely shaped data frame
  for (i in 1:nrow(exacData) ) {
    if (i %% 100 == 0) print(i)
    # expand multiallelics
    unpackedRow<-unpackVCFRow(exacData[i,],transcriptID)
    if (!init) {
      unpackedData=unpackedRow
      init=T
    } else unpackedData=rbind(unpackedData,unpackedRow)

  }

  return(unpackedData)
}

unpackVCFRow <- function(vcfRow, transcriptID) {
  #print(vcfRow)
  # unpack multiallelics into one row per alt allele
  fieldsToUnpack <- c("AC","AC_AFR","AC_AMR","AC_Adj","AC_EAS","AC_FIN","AC_Hemi","AC_Het","AC_Hom",
                      "AC_NFE","AC_OTH","AC_SAS","AF",
                      "Hemi_AFR","Hemi_AMR","Hemi_EAS","Hemi_FIN","Hemi_NFE","Hemi_OTH","Hemi_SAS",
                      "Het_AFR","Het_AMR","Het_EAS","Het_FIN","Het_NFE","Het_OTH","Het_SAS",
                      "Hom_AFR","Hom_AMR","Hom_EAS","Hom_FIN","Hom_NFE","Hom_OTH","Hom_SAS",
                      "MLEAC","MLEAF")

  df <- as.data.frame(vcfRow)
  # build DF with all fields except the ones we need to unpack and convert from list to scalar
  altIdx=which(names(vcfRow) %in% fieldsToUnpack)
  newRows <- df[,-altIdx]

  altAlleles<-as.character(vcfRow$ALT[[1]])
  # special treatment for CSQ value: keep only transcript ID passed in, match alt alleles:
  csq<- (Filter(function(x) length(grep(transcriptID,x))>0,df$CSQ[[1]])) # only keep annotations for given transcript ID
  #print(csq)
  csqList=lapply(csq, splitCSQAnnotation)

  annotationAlleles=as.character(unlist(lapply(csqList,function(x) x$Allele)))
  annotationAlleles[annotationAlleles=="-"]=""

  allAlleles <- c(newRows$REF,altAlleles)
  commonBases <- sapply(altAlleles,function(x,ref) commonSubstring(c(ref,x)), newRows$REF)

  for (i in 1:length(altAlleles)) {
    newDF<-newRows
    # change alt,
    newDF$ALT=altAlleles[[i]]
    # for all columns which are lists, pick element i
    for (fi in fieldsToUnpack) {
      valList=df[,fi]
      #print(valList)
      newDF[,fi]=valList[[1]][i]
    }
    if (length(csqList) >0) {
      if (length(altAlleles)== 1) {
        csqEl=csqList
      } else {
        commonBase <- commonBases[[which(names(commonBases) == newDF$ALT)]]
        #csqAllele <- paste0(commonBase,x$Allele)

        # pick elements d in csqList such that csqAllele matches
        csqEl<- (Filter(function(x) {
          d <- if(x$Allele=="-") commonBase else paste0(commonBase,x$Allele)
          d==newDF$ALT
        },csqList))
      }
      if (length(csqEl)==0) {
        newDF <- newDF[0,] # no matching annotation found
      }
      else {
        newDF<-cbind(newDF,(csqEl[[1]])[,-1])

      }
    } else {
      # workaround for empty csq list: happens if we don't have annotation for requested transcript id
      newDF <- newDF[0,]
    }
    result<- if (i==1) newDF else rbind(result,newDF)
  }

  return(result)
}

#' splitCSQAnnotation
#'
#' Helper function to split CSQ annotation in Exac's INFO field
#' @param csqString Raw CSQ string as read from INFO field
#' @return Data frame with each column being a field on CSQ. If annotation has multiple values (e.g. due to multi-allelic), each allele is a separate row
#' @export
#' @examples
#' csqAnnot<- splitCSQAnnotation('C|ENSG00000223972|ENST00000456328|Transcript|non_coding_transcript_exon_variant&non_coding_transcript_variant|620||||||1||1|DDX11L1|HGNC|37102|processed_transcript|YES||||||||3/3|||ENST00000456328.2:n.620G>C|||||||||||||||||||,C|ENSG00000223972|ENST00000450305|Transcript|splice_region_variant&non_coding_transcript_exon_variant&non_coding_transcript_variant|412||||||1||1|DDX11L1|HGNC|37102|transcribed_unprocessed_pseudogene|||||||||5/6|||ENST00000450305.2:n.412G>C|||||||||||||||||||')
splitCSQAnnotation <- function(csqString) {
  EXAC_VEP_FIELDS_31 <- 'Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|DISTANCE|STRAND|VARIANT_CLASS|MINIMISED|SYMBOL_SOURCE|HGNC_ID|CANONICAL|TSL|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|SIFT|PolyPhen|DOMAINS|HGVS_OFFSET|GMAF|AFR_MAF|AMR_MAF|ASN_MAF|EAS_MAF|EUR_MAF|SAS_MAF|AA_MAF|EA_MAF|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|LoF_info|LoF_flags|LoF_filter|LoF|context|ancestral'
  csqFormat <- strsplit(EXAC_VEP_FIELDS_31,
                        '\\|')[[1]]
  # split CSQ string into data frame with names columns
  a=strsplit(csqString,"\\|")
  b=as.data.frame(t(unlist(a)))
  # FIXME? if last field was empty there is a column missing after split?
  if (ncol(b) < length(csqFormat)) b[,length(csqFormat)]=""
  names(b)=csqFormat
  b
}


getHGVS <- function(csqStringList) {
  unlist(lapply(csqStringList, function(x) {y=splitCSQAnnotation(x); y$HGVSc}))
}

getHGVSFromExacFile <- function(exacVCF, transcript, geneName) {
  exacData <- loadExacData(exacVCF, transcript, geneName)
  # simple: just get list of HGVS codes for all ExAC variants - will reannotate later
  ann <- lapply(exacData$CSQ, getHGVS)
  unlist(lapply(ann, function(x) {idx=grep(transcript$id,x); x[idx]}))
}