hgvsParseR: Parse HGVS variant descriptor strings

Documented in translateHGVS

# Copyright (C) 2018  Jochen Weile, Roth Lab
#
# This file is part of hgvsParseR.
#
# hgvsParseR is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# hgvsParseR is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with hgvsParseR.  If not, see <https://www.gnu.org/licenses/>.


#' Translate a single HGVS string from nucleotide to amino acid level
#' 
#' This function takes a codon-level HGVS string and translates it to amino acid level.
#' It does so in two different ways: A joint view, that describes the overall effect of all mutations
#' on the protein, as well as a codon-wise segmented way that describes changes on individual 
#' codons/amino-acids separately, so that they are more easily separable for Marginal Frequency analysis
#' 
#' @param hgvs a single cds-level HGVS string. May contain multiple mutations in-cis.
#' @param cdsSeq the DNA coding sequence
#' @param builder an optional protein HGVS builder object, if none is provided, a new one is created.
#' @param strictMode whether HGVS will be validated against the reference sequence before translation
#' @return a named vector containing the following strings: hgvsp: the translated HGVS string; 
#'   codonChanges: A simple string expression of the involved codon changes; codonHGVS: A codon-wise
#'   segmented HGVS string; aaChanges: A simple amino acid change string; and aaChangeHGVS: A codon-wise
#'   segmented HGVS string.
#' @export
translateHGVS <- function(hgvs, cdsSeq, 
	builder=new.hgvs.builder.p(aacode=3),cbuilder=new.hgvs.builder.c(),
	strictMode=TRUE) {

	library(yogitools)

	if (!inherits(hgvs,"character") && length(hgvs) != 1) {
		stop("Input must be single HGVS string!")
	}
	if (!grepl("^c\\.",hgvs)) {
		stop("HGVS string must be at CDS level!")
	}

	#extract coding sequence and break into codons
	# cdsSeq <- params$template$cdsSeq
	cdsLen <- nchar(cdsSeq)
	if (cdsLen%%3 != 0) {
		stop("Invalid CDS sequence: Length must be factor of 3!")
	}
	if (!grepl("^[ACGT]{3,}$",cdsSeq)) {
		stop("CDS sequence is not a valid DNA sequence!")
	}
	if (!grepl("^ATG",cdsSeq)) {
		warning("CDS sequence has no ATG start codon!")
	}
	codonStarts <- seq(1,cdsLen,3)
	# codonIndices <- sapply(codonStarts, function(i) c(i,i+1,i+2))
	codons <- sapply(codonStarts,function(i) substr(cdsSeq,i,i+2))
	# codons <- params$template$cdsCodons

	#load DNA translation table
	data(trtable)

	#parse the DNA HGVS string
	breakdown <- parseHGVS(hgvs)
	#not every breakdown will have an end column, but we rely on it below
	#so we add a fake one if it's missing.
	if (!("end" %in% colnames(breakdown))) {
		breakdown$end <- NA
	}
	if (!("variant" %in% colnames(breakdown))) {
	  breakdown$variant <- NA
	}

	#check for unsupported mutation types
	unsupportedTypes <- c("duplication","inversion","conversion","amplification","invalid")
	if (any(breakdown$type %in% unsupportedTypes)) {
		offenders <- with(breakdown,unique(type[which(type %in% unsupportedTypes)]))
		stop("Unsupported variant type(s): ",paste(offenders, collapse=","), " in ", hgvs)
	}

	#check for any cases that are reported backwards and correct them
	if (with(breakdown,any(start > end,na.rm=TRUE))) {
	  culprits <- with(breakdown,which(start > end))
	  for (culprit in culprits) {
	    newStart <- breakdown[culprit,"end"]
	    breakdown[culprit,"end"] <- breakdown[culprit,"start"]
	    breakdown[culprit,"start"] <- newStart
	  }
	}
	
	####################################
	# CHECK FOR VARIANTS OUT OF BOUNDS #
	####################################
	oob <- sapply(1:nrow(breakdown), function(i) with(breakdown[i,],{
	  any(c(
	    type=="insertion" && (end < 2 || start > cdsLen-1),
	    type %in% c("deletion","delins") && (end < 1 || start > cdsLen),
	    type %in% c("substitution","singledeletion") && (start < 1 || start > cdsLen)
	  ))
	}))
	#remove out-of-bounds variants from consideration
	if (any(oob)) {
	  breakdown <- breakdown[!oob,]
	}
	#also, if everything is out of bounds, we're done here
	if (all(oob)) {
	  return(c(
	    hgvsc=hgvs,hgvsp="p.=",
	    codonChanges="silent",codonHGVS="c.=",
	    aaChanges="silent",aaChangeHGVS="p.="
	  ))
	}
	
	#########################################
	# CHECK FOR MERGEABLE ADJACENT VARIANTS #
	#########################################
	if (nrow(breakdown) > 1) {
	  #sort by start position
	  breakdown <- breakdown[order(breakdown$start),]
	  #define replacement ranges
	  ranges <- do.call(rbind,lapply(1:nrow(breakdown), function(i) {
	    s <- breakdown[i,"start"]
	    e <- breakdown[i,"end"]
	    switch(breakdown[i,"type"],
         substitution = c(start=s,end=s),
         deletion = c(start=s,end=e),
         singledeletion = c(start=s,end=s),
         #yes, insertions end before they begin
         #because they don't replace anything
         insertion = c(start=e,end=s),
         delins = c(start=s,end=e),
         stop("Unsupported type!")
	    )
	  }))
	  #check which variants are _not_ adjacent
	  maxDist <- 3
	  nadj <- which(sapply(2:nrow(ranges),function(i){
	    ranges[i,"start"]-ranges[i-1,"end"]
	  }) > maxDist)
	  #and use that to calculate "runs" of variants
	  runs <- cbind(start=c(1,nadj+1),end=c(nadj,nrow(ranges)))
	  
	  #if there are indeed clustered runs, then condense them
	  if (nrow(runs) < nrow(breakdown)) {
	    #construct the condensed component variants per cluster
  	  components <- sapply(1:nrow(runs), function(i) {
  	    idx <- runs[i,1]:runs[i,2]
  	    # start <- min(breakdown[idx,"start"],na.rm=TRUE)
  	    start <- min(ranges[idx,"start"])
  	    # end <- max(c(start,breakdown[idx,"end"]),na.rm=TRUE)
  	    end <- max(ranges[idx,"end"])
  	    
  	    na2empty <- function(x) if (is.na(x)) "" else x
  	    
  	    insSeq <- na2empty(breakdown[idx[[1]],"variant"])
  	    if (length(idx) > 1) {
  	      for (j in 2:length(idx)) {
  	        idxj <- idx[[j]]
  	        idxk <- idx[[j-1]]
  	        if (ranges[idxj,"start"]-ranges[idxk,"end"] > 1) {
  	          skippedBases <- substr(cdsSeq,ranges[idxk,"end"]+1,ranges[idxj,"start"]-1)
  	          insSeq <- paste0(insSeq,skippedBases)
  	        }
  	        insJ <- na2empty(breakdown[idxj,"variant"])
  	        insSeq <- paste0(insSeq,insJ)
  	      }
  	    }
  	    
  	    if (is.na(insSeq) || insSeq == "") {#deletion
  	      cbuilder$deletion(start=start,stop=end)
  	    } else if (end-start==0 && nchar(insSeq)==1) {#substituion
    	      anc <- unique(breakdown[idx,"ancestral"])
    	      if (is.na(anc)) {
    	        anc <- substr(cdsSeq,start,start)
    	      }
    	      cbuilder$substitution(pos=start,ancestral=anc,variant=insSeq)
  	     } else if (end-start==-1) {#insertion
  	        cbuilder$insertion(start=end,seq=insSeq)
  	    } else {#delins
  	      cbuilder$delins(start=start,stop=end,seq=insSeq)
  	    }
  	  })
  	  
  	  if (length(components) > 1) {
  	    hgvs <- cbuilder$cis(components)
  	  } else {
  	    hgvs <- components
  	  }
  	  
  	  breakdown <- parseHGVS(hgvs)
  	  if (!("end" %in% colnames(breakdown))) {
  	    breakdown$end <- NA
  	  }
  	  if (!("variant" %in% colnames(breakdown))) {
  	    breakdown$variant <- NA
  	  }
	  }
	}
	
	#################################
	# CHECK FOR 5'/3'-UTR MUTATIONS #
	#################################
	
	isUTR <- sapply(1:nrow(breakdown), function(i) with(breakdown[i,],{
	  start >= cdsLen || (!is.na(end) && end <= 1) || (is.na(end) && start < 1)
	}))
	utrVars <- which(isUTR)

	#########################
	# CHECK FOR FRAMESHIFTS #
	#########################
  fsTypes <- c("singledeletion","deletion","insertion","delins")
	fsCandidates <- which((breakdown$type %in% fsTypes) & !isUTR)
	if (length(fsCandidates) > 0) {
		inFrame <- sapply(fsCandidates,function(i) {
			switch(
				breakdown[i,"type"],
				singledeletion = FALSE,
				deletion = {
					with(breakdown[i,],(end-start+1)%%3==0)
				},
				insertion = {
					nchar(breakdown[i,"variant"])%%3==0
				},
				delins = {
					with(breakdown[i,],(nchar(breakdown[i,"variant"])-(end-start+1))%%3==0)
				}
			)
		})
		frameshifts <- fsCandidates[!inFrame]
	} else {
		frameshifts <- integer(0)
	}

	#if any frameshifts were found, describe the first of them and we're done.
	if (length(frameshifts) > 0) {
		first <- frameshifts[[which.min(breakdown[frameshifts,"start"])]]
		fsStart <- breakdown[first,"start"]
		codonIdx <- (fsStart -1) %/%3 + 1
		aa <- trtable[[codons[[codonIdx]]]]
		fsout <- builder$frameshift(codonIdx,aa)
		fsSimple <- paste0(aa,codonIdx,"fs")
		fsCodon <- paste0(codons[[codonIdx]],codonIdx,"indel")
		return(c(
		  hgvsc=hgvs,hgvsp=fsout,
		  codonChanges=fsCodon,codonHGVS=breakdown[first,"hgvs"],
			aaChanges=fsSimple,aaChangeHGVS=fsout
		))
	}

	# Note: Occasionally there can be a frameshift that is rescued by a complementary indel further
	# downstream. But detecting those cases would require simulating the entire coding sequence. So
	# we're not bothering to deal with that here in the interest of compute time (and coding time).

	# Either way, if there are no frameshifts, then we're only just getting started...

	#############################
	# DETERMINE AFFECTED CODONS #
	#############################

	#list affected codons of each mutation to identify potential overlap
	affectedCodons <- lapply(1:nrow(breakdown),function(i) {
	  if (isUTR[[i]]) {
	    return(integer(0))
	  }
		ncs <- if (!is.na(breakdown[i,"end"])) {   
			seq(breakdown[i,"start"],breakdown[i,"end"])
		} else {
			breakdown[i,"start"]
		}
		afpos <- unique((ncs -1) %/%3 + 1)
		return(afpos[afpos <= length(codons) & afpos > 0])
	})

	#If the affected Codons include the stop codon, it's a stop-loss!
	if (length(codons) %in% Reduce(union,affectedCodons)) {
		culprit <- which(sapply(affectedCodons, function(cs) length(codons) %in% cs))
		fsStart <- breakdown[culprit,"start"]
		codonIdx <- (fsStart -1) %/%3 + 1		
		aa <- trtable[[codons[[codonIdx]]]]
		fsout <- builder$frameshift(codonIdx,aa)
		fsSimple <- paste0(aa,codonIdx,"fs")
		fsCodon <- paste0(codons[[codonIdx]],codonIdx,"indel")
		return(c(
		  hgvsc=hgvs,hgvsp=fsout,
		  codonChanges=fsCodon,codonHGVS=breakdown[culprit,"hgvs"],
			aaChanges=fsSimple,aaChangeHGVS=fsout
		))
	}

	########################################
	# APPLY CHANGES TO EACH AFFECTED CODON #
	########################################

	#calculate cumulative AA changes by iterating over each codon affected at least once
	aaChanges <- lapply(Reduce(union,affectedCodons), function(codonIdx) {
		#extract relevant codon sequence
		codon <- codons[[codonIdx]]
		#make a copy of the unchanged codon for reference matching purposes
		originalCodon <- codon
		cStart <- (codonIdx-1)*3+1
		#pull up the relevant mutation entries that affect this codon
		relevantMuts <- which(sapply(affectedCodons,function(l) codonIdx %in% l))
		#iterate over the relevant entries and cumulatively apply them to the codon
		for (mi in relevantMuts) {
		  #FIXME: This checks for a prior deletion interfering with the frame
		  #of the current codon. If this happens, the variant is skipped for now
		  if (codon == "") {
		    warning("Cross-codon deletion interference: ",hgvs)
		    next
		  }
			#calculate start and end indices with respect to whole CDS and codon
			mStart <- breakdown[mi,"start"]
			mEnd <- breakdown[mi,"end"]
			mStartInner <- (mStart-1)%%3+1
			mEndInner <- (mEnd-1)%%3+1
			insSeq <- breakdown[mi,"variant"]
			switch(
				breakdown[mi,"type"],
				substitution={
					if (strictMode) {
						wtNc <- breakdown[mi,"ancestral"]
						if (substr(originalCodon,mStartInner,mStartInner) != wtNc) {
							stop("Reference mismatch!")
						}
					}
					substr(codon,mStartInner,mStartInner) <- insSeq
				},
				#Reminder: deletion / insertion lengths can only be multiple of 3
				#since they would have been caught in the frameshift check otherwise.
				deletion={
					#does the deletion leave a codon prefix at the beginning?
					if (mStart > cStart) {
						#FIXME: What if another mutation changes the downstream sequence?
						preSeq <- substr(codon,1,mStartInner-1)
						sufLen <- 3-nchar(preSeq)
						sufSeq <- substr(cdsSeq,mEnd+1,mEnd+sufLen)
						codon <- paste0(preSeq,sufSeq)
					} else {
						#otherwise it either covers the whole codon, 
						#or it has already been fused with an upstream codon
						#so we must delete this codon
						codon <- ""
					}
				},
				insertion={
					#double-check that start and end are right next to each other
					if (mEnd != mStart+1) {
						stop("Invalid insertion point!")
					}
				  if (mStart == cStart-1) {#insertion is _before_ codon
				    #do nothing! the insertion was already handled in 
				    #the previous codon
				  } else if (mStart == cStart+2) {#insertion is _after_ codon
				    codon <- paste0(codon,insSeq)
				  } else {#insertion is _inside_ codon
				    preSeq <- substr(codon,1,mStartInner)
				    sufSeq <- substr(codon,mEndInner,3) 
				    #this is now actually more than just one codon!
				    codon <- paste0(preSeq,insSeq,sufSeq)
				  }
				},
				delins={
					#check if the insertion is shorter than the deletion
					if (nchar(insSeq) < (mEnd-mStart+1)) {
						#if we're in the first codon and are starting in the middle of it
						preSeq <- if (mStart > cStart && mStart < cStart+3) {
							substr(codon,1,mStartInner-1)
						} else {
							""
						}
						#how much of the insertion sequence has already been used?
						used <- cStart - mStart
						#how much do we need to still fill this codon?
						needed <- 3-nchar(preSeq)
						midSeq <- substr(insSeq,used+1,used+needed)
						#if we ran out of insertion sequence, make up the rest from downstream CDS
						sufLen <- needed-nchar(midSeq)
						sufSeq <- substr(cdsSeq,mEnd+1,mEnd+sufLen)
						#if there is no insertion sequence for this codon, it's been entirely deleted.
						#or previously used up in a suffix.
						if (midSeq=="") {
							codon <- ""
						} else {
							codon <- paste0(preSeq,midSeq,sufSeq)
						}
					#otherwise the insertion is longer than (or equal to) the deletion
					} else {
						#check if we're starting mid-codon
						if (mStart > cStart) {
							preSeq <- substr(codon,1,mStartInner-1)
							sufLen <- 3-nchar(preSeq)
							#if the whole delins takes part within the same codon
							if (mEnd <= cStart+2) {
							  postSeq <- substr(codon,mEndInner+1,nchar(codon))
							  codon <- paste0(preSeq,insSeq,postSeq)
							} else {#otherwise we're reaching into subsequent codons
  							sufSeq <- substr(insSeq,1,sufLen)
  							codon <- paste0(preSeq,sufSeq)
							}
						} else if (mEnd > cStart+2) {
							#if this is not the last codon in the replacement area, we replace everything
							offset <- cStart-mStart
							codon <- substr(insSeq,offset+1,offset+3)
						} else {
							#this is the last codon, which may have a suffix, as well as leftover insertion sequence
							sufSeq <- substring(codon,mEndInner+1,3)
							offset <- cStart-mStart
							codon <- paste0(substr(insSeq,offset+1,nchar(insSeq)),sufSeq)
						}
					}
				},
				{#any other type
					stop("Unsupported HGVS type:",breakdown[mi,"type"])
				}
			)
		}
		wtcodon <- codons[[codonIdx]]
		wtaa <- trtable[[wtcodon]]
		mutaa <- if (codon == "") {
			"-"
		} else if (nchar(codon) > 3) {
			paste(sapply(seq(1,nchar(codon),3),function(i) trtable[[substr(codon,i,i+2)]]),collapse="")
		} else {
			trtable[[codon]]
		}
		list(pos=codonIdx,wtaa=wtaa,mutaa=mutaa,wtcodon=wtcodon,mutcodon=codon)
	})
	
	#due to all UTR mutations (see above), the list may be empty
	if (length(aaChanges) == 0) {
	  return(c(
	    hgvsc=hgvs,hgvsp="p.=",
	    codonChanges="silent",codonHGVS="c.=",
	    aaChanges="silent",aaChangeHGVS="p.="
	  ))
	}
	
  #turn into data.frame
  aaChanges <- as.df(aaChanges)
  #sort aa changes by position
  aaChanges <- aaChanges[order(aaChanges$pos),]

	#Some in-frame indels can result in codons being re-constituted into equivalents of themselves.
	# e.g deleting 'CCA' from TCCACC results in TCC--- . While these are technically correct calls, 
	# They are not useful in terms of protein consequences, so we filter them out in strict mode
	if (strictMode && any(aaChanges$wtcodon == aaChanges$mutcodon)) {
		aaChanges <- aaChanges[-which(aaChanges$wtcodon == aaChanges$mutcodon),]
	}

	#######################################################
	# Build codon-centric change strings for later output #
	#######################################################
	#Helper function to build codon-specific HGVS
	codonWiseHGVS <- function(wt,pos,mut) {
		cstart <- pos*3-2
		diffs <- sapply(1:3,function(i)substr(wt,i,i)!=substr(mut,i,i))
		ndiff <- sum(diffs)
		if (mut == "") {
			return(cbuilder$deletion(cstart,cstart+2))
		} else if (nchar(mut) == 3 && ndiff == 1) { #SNV
			offset <- which(diffs)
			wtbase <- substr(wt,offset,offset)
			mutbase <- substr(mut,offset,offset)
			snvpos <- cstart+offset-1
			return(cbuilder$substitution(snvpos,wtbase,mutbase))
		} else { 
			# return(cbuilder$delins(cstart,cstart+nchar(mut)-1,mut))
		  return(cbuilder$delins(cstart,cstart+2,mut))
		}
	}
	#Helper function to build single-AA specific HGVS
	aaWiseHGVS <- function(wt,pos,mut) {
		if(mut=="-") {
			return(builder$deletion(pos,wt,pos,wt))
		} else if (nchar(mut) > 1) {
			return(builder$delins(pos,wt,pos,wt,toChars(mut)))
		} else if (wt==mut) {
			return(builder$synonymous(pos,wt))
		} else {
			return(builder$substitution(pos,wt,mut))
		}
	}
	#simple codon change description string
	codonChangeStr <- paste(with(aaChanges,paste0(wtcodon,pos,mutcodon)),collapse="|")
	#HGVS-compliant codon change description string
	codonChangeHGVS <- if (nrow(aaChanges) > 1) {
		do.call(cbuilder$cis,with(aaChanges,mapply(codonWiseHGVS,wtcodon,pos,mutcodon,SIMPLIFY=FALSE)))
	} else {
		with(aaChanges,codonWiseHGVS(wtcodon,pos,mutcodon))
	}
	#simple AA change description string
	aaChangeStr <- paste(with(aaChanges,paste0(wtaa,pos,mutaa)),collapse="|")
	#HGVS-compliant AA change description string
	aaChangeHGVS <- if (nrow(aaChanges) > 1) {
		do.call(builder$cis,with(aaChanges,mapply(aaWiseHGVS,wtaa,pos,mutaa,SIMPLIFY=FALSE)))
	} else {
		with(aaChanges, aaWiseHGVS(wtaa,pos,mutaa))
	}

	##############################
	# BUILD OVERALL PROTEIN HGVS #
	##############################

	#Check for runs in the amino acid changes and group them accordingly
	if (nrow(aaChanges) > 1) {
		#chainHeads are mutations which are not immediately to the right of another one
		chainHeads <- which(c(TRUE,sapply(2:nrow(aaChanges),function(i) aaChanges$pos[[i]]-aaChanges$pos[[i-1]] > 1)))
		#find the head for each chain member
		assignedHead <- sapply(1:nrow(aaChanges),function(i) chainHeads[[max(which(chainHeads <= i))]])
		#break down the table according to chain assignment
		aaChangeGroups <- tapply(1:nrow(aaChanges),assignedHead,function(is) aaChanges[is,])
	} else {
		#if there is only one mutation, it's all one group :P
		aaChangeGroups <- list(aaChanges)
	}

	#iterate over groups and build HGVS strings
	aaHGVS <- lapply(aaChangeGroups, function(acs) {
		if (nrow(acs) > 1) {
			#this is a chain
			insSeq <- gsub("-","",paste0(acs$mutaa,collapse=""))
			fromPos <- acs$pos[[1]]
			fromAA <- acs$wtaa[[1]]
			toPos <- acs$pos[[nrow(acs)]]
			toAA <- acs$wtaa[[nrow(acs)]]
			#if it's all "-" it's a deletion
			if (insSeq == "") {
				builder$deletion(fromPos,fromAA,toPos,toAA)
			} else {
				builder$delins(fromPos,fromAA,toPos,toAA,toChars(insSeq))
			}
		} else {
			#singletons can be synonymous, deletions, delins or substitutions (which include nonsense)
			with(acs,{
				if (mutaa == wtaa) {
					builder$synonymous(pos,wtaa)
				} else if (mutaa == "-") {
					builder$deletion(pos,wtaa,pos,wtaa)
				} else if (nchar(mutaa) > 1) {
					builder$delins(pos,wtaa,pos,wtaa,toChars(mutaa))
				} else {
					builder$substitution(pos,wtaa,mutaa)
				}
			})
			
		}
	})

	#collapse in-cis multi-mutants
	finalHGVS <- if (length(aaHGVS) > 1) {
		do.call(builder$cis,aaHGVS)
	} else {
		aaHGVS[[1]]
	}

	#and finally, return the result
	return(c(
	  hgvsc=hgvs,hgvsp=finalHGVS,
		codonChanges=codonChangeStr,codonHGVS=codonChangeHGVS,
		aaChanges=aaChangeStr,aaChangeHGVS=aaChangeHGVS
	))

}