R/get_vep.R

#' request_from_ensembl
#'
#' Internal Function to send request to Ensembl REST API
#' @param url Full URL to send to Ensembl
#' @return JSON object with full request
#' @keywords cats
#' @export
#' @import httr
#' @examples
#' seq<- request_from_ensembl(url)

request_from_ensembl <- function(url, tries=0) {
  #assign("initial_time", Sys.time(), envir=timeEnv)#' make a URL request to the Ensembl service
  initial_time = Sys.time()
  # strip any possible whitespace from the url, since imported datasets
  # occasionally contain whitespace in the values used to construct the URL.
  url = gsub(" ", "", url)

  # check that we are not requesting urls fater than that allowed by Ensembl,
  # sleep until the period expires
  current_time = Sys.time()
  diff = 0.05 - as.numeric(current_time - initial_time)
  if (diff > 0) { Sys.sleep(diff) }

  # set the previous request time to that of the current request
  #assign("initial_time", current_time, envir=timeEnv)

  # cut out after making 5 attempts to access the url
  tries = tries + 1
  stopifnot(tries <= 5)

  request = httr::GET(url)

  # handle the possible http request return status codes, such as when the
  # server is unavailable, when we have made too many requests, or requested
  # an impossible URL.
  if (request$status_code == 503) { # server down
    Sys.sleep(30)
    return(request_from_ensembl(url, tries))
  } else if (request$status_code == 429) { # too frequent requests
    reset_time = as.numeric(request$headers$`x-ratelimit-reset`)
    Sys.sleep(reset_time)
    return(request_from_ensembl(url, tries))
  } else if (request$status_code == 400) { # bad url
    msg = paste("bad url request: ", url, sep = "")
    return(NA)
  } else if (request$status_code != 200) { # other status errors
    msg = paste("unknown status: ", request$status_code, " at: ", url, sep = "")
    stop(msg)
  }

  return(intToUtf8(request$content))
}

#' getGenomicData
#'
#' Function to get raw genomic data from Ensembl REST API
#' @param tag ENST tag to get sequence for
#' @return list with elements startcoord, endCoord and seq=sequence
#' @keywords cats
#' @export
#' @examples
#' seq<- getGenomicData('ENST00000343536')
getGenomicData <- function(tag) {
  # construct the VEP REST url for the variant
  base_url = "http://grch37.rest.ensembl.org/"
  ext = "sequence/id/"
  ext = paste(ext, tag, sep="")
  url = paste(base_url, ext,"?type=genomic")

  # get the VEP data for the variant
  request = request_from_ensembl(url)
  #print (request)
  result <- rjson::fromJSON(request)
  desc <- strsplit(result$desc,":")[[1]]
  seq <- result$seq
  startCoord <- desc[4]
  stopCoord <- desc[5]
  return(list(startCoord=startCoord,stopCoord=stopCoord,seq=seq))
}

#' getCDNASequence
#'
#' Function to get raw cDNA (CDS) from Ensembl REST API
#' @param tag ENST tag to get sequence for
#' @return List containing raw reply
#' @keywords cats
#' @export
#' @examples
#' seq<- getCDNASequence('ENST00000343536')
getCDNASequence <- function(tag) {
  # construct the VEP REST url for the variant
  base_url = "http://grch37.rest.ensembl.org/"
  ext = "sequence/id/"
  ext = paste(ext, tag, sep="")
  url = paste(base_url, ext,"?type=cds")

  # get the VEP data for the variant
  request = request_from_ensembl(url)
  #print (request)
  result <- rjson::fromJSON(request)$seq
  result
}

#' getVEPFromHGVS
#'
#' Function to get raw cDNA (CDS) from Ensembl REST API
#' @param hgvsTag Character string with mutation HGVS Code
#' @param tag Character string with ENST tag to get sequence for
#' @param idIncludedInTag Does hgvsTag have form ENSTxxx:c.156T>A or just the hgvs code?
#' @param args Extra args to pass to VEP API request.
#' @param includeOnlyCanonicalTranscript Only output canonical transcript data. default to FALSE
#' @param outputDataFrame Flag to indicate whether data frame or list is output. Defaults to FALSE
#' @param verbose Output verbose data for debugging
#' @return Request from VEP API
#' @keywords cats
#' @export
#' @examples
#' seq <- getVEPFromHGVS('c.1559delT','ENST00000343536')
getVEPFromHGVS <- function(hgvsTag, transcriptID, idIncludedInTag=F,
                           args="canonical=1", includeOnlyCanonicalTranscript=F,
                           outputDataFrame=F, verbose=F) {
  if (idIncludedInTag) tag=as.character(hgvsTag) else tag=paste0(transcriptID,":",as.character(hgvsTag))
  if (verbose) print(paste0("Requesting VEP API:", tag))
  url = sprintf('http://grch37.rest.ensembl.org/vep/human/hgvs/%s?%s',tag,args)
  request = request_from_ensembl(url)
  if (is.na(request)) {
    warning(paste0("Bad request:",url))
    return(NA)
  } else {
    r <- rjson::fromJSON(request)
    if (length(r)==0) {
      warning(paste0("Empty reply received:",url))
      return(NA)
    }
    reply <- r[[1]]
    if (!('transcript_consequences' %in% names(reply) )) {
      warning(paste0("Incomplete reply received:",url))
      return(NA)
    }


    # flatten lists to make it easier to parse downsstream if required
    if (includeOnlyCanonicalTranscript) {
      tc<- (Filter(function(x) x$transcript_id==transcriptID, reply$transcript_consequences)) # only keep annotations for given transcript ID
      reply$transcript_consequences<-tc[[1]]
    }
    if (outputDataFrame) reply <- annovarFormatFromVEP(reply)
    return(reply)
  }
}

annovarFormatFromVEP <- function(vepReply) {

  chrom = vepReply$seq_region_name
  pstart = vepReply$start
  pstop = vepReply$end
  if (pstop < pstart) {
    pstart = pstop
  }

  alstr <- vepReply$allele_string
  a<-strsplit(x = alstr,split="/")
  ref <- a[[1]][1]
  alt <- a[[1]][2]

  if (vepReply$transcript_consequences$strand == -1 ) {
    ref = as.character(reverseComplement(DNAString((ref))))
    alt = as.character(reverseComplement(DNAString((alt))))
  }


  outdata <- data.frame(Chrom=as.character(vepReply$seq_region_name),
                        Start=as.numeric(pstart),
                        Stop=as.numeric(pstop),
                        Reference=as.character(ref),
                        Alternate=as.character(alt))

  outdata <- cbind(outdata,data.frame(t(unlist(vepReply))))
}

#' getCanonicalTranscript
#'
#' Function to get transcript object for a given gene from Ensembl REST API
#' @param tag ENST tag to get sequence for
#' @return List containing only transcript elements. Most important field: transcript$id gives corresponding ENST tag
#' @keywords cats
#' @export
#' @examples
#' transcript <- getCanonicalTranscript('CFTR')
getCanonicalTranscript <- function(geneName) {

  url = sprintf('http://rest.ensembl.org/xrefs/symbol/homo_sapiens/%s?',geneName)
  request = request_from_ensembl(url)
  if (is.na(request)) {
    warning(paste0("Bad request:",url))
    return(NA)
  } else {
    request = rjson::fromJSON(request)[[1]]

    if (request$type == "gene") {
      geneID <- request$id
      url = sprintf('http://grch37.rest.ensembl.org/lookup/id/%s?expand=1',geneID)
      #print(url)
      # get the VEP data for the variant
      request = request_from_ensembl(url)
      if (is.na(request)) {
        warning(paste0("Bad request:",url))
        return(NA)
      } else {
        request = rjson::fromJSON(request)
        canonicalTranscript <- Filter(function(x) x$is_canonical>0, request$Transcript)[[1]]
        return(canonicalTranscript)
      }
      return(request)
    } else {
      warning(paste0("Unexpected object:",request))
      return(NA)
    }
  }

}
gdelangel/genomicRTools documentation built on May 16, 2019, 11:14 p.m.