R/ensembl_rest.R

# functions to extract data from the ensembl REST API

# consequence list, as sorted at http://www.ensembl.org/info/genome/variation/predicted_data.html
consequences = c("transcript_ablation", "splice_donor_variant",
    "splice_acceptor_variant", "stop_gained", "frameshift_variant",
    "start_lost", "initiator_codon_variant", "stop_lost",
    "transcript_amplification", "inframe_insertion", "inframe_deletion",
    "missense_variant", "protein_altering_variant", "splice_region_variant",
    "incomplete_terminal_codon_variant", "stop_retained_variant",
    "synonymous_variant", "coding_sequence_variant", "mature_miRNA_variant",
    "5_prime_UTR_variant", "3_prime_UTR_variant",
    "non_coding_transcript_exon_variant", "intron_variant",
    "NMD_transcript_variant", "non_coding_transcript_variant",
    "upstream_gene_variant", "downstream_gene_variant", "TFBS_ablation",
    "TFBS_amplification", "TF_binding_site_variant",
    "regulatory_region_ablation", "regulatory_region_amplification",
    "regulatory_region_variant", "feature_elongation", "feature_truncation",
    "intergenic_variant")
severity = data.frame(consequence=consequences, rank=seq(1:length(consequences)),
    stringsAsFactors=FALSE)

timeEnv = new.env()
assign("initial_time", Sys.time(), envir=timeEnv)

#' make a URL request to the Ensembl service
#'
#' @param ext extension for URL
#' @param build genome build for REST API
#' @param headers headers for the url
#' @param tries number of attempts that have been mader to access the URL
#'
#' @export
#' @return a character string, typically json encoded
#'
#' @examples
#' request_from_ensembl("/vep/human/region/1:205901016:205901016/A")
request_from_ensembl <- function(ext, headers="", build="grch37", tries=0) {
    
    base_url = "rest.ensembl.org"
    # strip any possible whitespace from the url, since imported datasets
    # occasionally contain whitespace in the values used to construct the URL.
    ext = gsub(" ", "", ext)
    
    # only tolerate the grch37 and grch38 genome builds, since they are the only
    # genome builds supported by the Ensembl REST API
    allowed_builds = c("grch37", "grch38")
    stopifnot( tolower(build) %in% allowed_builds )
    if (build == "grch37") {
        base_url = paste("grch37", base_url, sep = ".")
    }
    
    content = paste("?", paste(headers, "content-type=application/json", sep=";"), sep="")
    url = paste("http://", base_url, ext, content, sep="")
    
    # check that we are not requesting urls fater than that allowed by Ensembl,
    # sleep until the period expires
    current_time = Sys.time()
    diff = 0.1 - as.numeric(current_time - get("initial_time", current_time, envir=timeEnv))
    Sys.sleep(max(diff, 0))
    
    # set the previous request time to that of the current request
    assign("initial_time", current_time, envir=timeEnv)
    
    # cut out after making 5 attempts to access the url
    tries = tries + 1
    stopifnot(tries <= 5)
    
    request = httr::GET(url)
    
    # handle the possible http request return status codes, such as when the
    # server is unavailable, when we have made too many requests, or requested
    # an impossible URL.
    if (request$status_code == 503) { # server down
        Sys.sleep(30)
        return(request_from_ensembl(ext, headers, build, tries))
    } else if (request$status_code == 429) { # too frequent requests
        reset_time = as.numeric(request$headers$`retry-after`)
        print(paste("rapid ensembl requests, waiting for", reset_time, "seconds", sep=" "))
        Sys.sleep(reset_time)
        return(request_from_ensembl(ext, headers, build, tries))
    } else if (request$status_code == 400) { # bad url
        msg = paste("bad url request: ", url, sep = "")
        stop(msg)
    } else if (request$status_code != 200) { # other status errors
        msg = paste("unknown status: ", request$status$code, " at: ", url, sep = "")
        stop(msg)
    }
    
    return(intToUtf8(request$content))
}

#' find the VEP consequence for a variant
#'
#' @param variant data frame or list for a variant, containing columns named
#'     "chrom", "start_pos", "end_pos", and "alt_allele" code for a single variant
#' @param include_hgvs whether to also include the HGVS consequence strings for
#'     cDNA and protein.
#' @param include_deleterious_score whether to also include SIFT and polyphen predictions
#' @param build genome build to find consequences on
#' @param verbose flag indicating whether to print variants as they are checked
#'
#' @export
#' @return a character string containing the most severe consequence, as per VEP
#'     annotation formats.
#'
#' @examples
#' get_vep_consequence(data.frame(chrom=c("1"),
#'     start_pos=c("1000000"), end_pos=c("1000000"), alt_allele=c("A"),
#'     ref_allele=c("G")))
#' get_vep_consequence(list(chrom="1", start_pos="1000000",
#'     end_pos="1000000", alt_allele="A", ref_allele="G"))
get_vep_consequence <- function(variant, include_hgvs=FALSE, include_deleterious_score=FALSE, build="grch37", verbose=FALSE) {
    
    # get the correct allele for deletions for the Ensembl REST API requests
    if (variant[["alt_allele"]] == "") {variant[["alt_allele"]] = "-"}
    
    # define parts of the URL
    ext = "/vep/human/region/"
    ext = paste(ext, variant[["chrom"]], ":", variant[["start_pos"]], ":", variant[["end_pos"]], "/", sep="")
    
    # define the URLs for the ref and alt alleles
    alt_ext = paste(ext, variant[["ref_allele"]], sep = "")
    ext = paste(ext, variant[["alt_allele"]], sep = "")
    
    headers = ""
    if (include_hgvs) {
        headers = "hgvs=1"
    }
    
    if (verbose) {
        print(paste("chr", variant[["chrom"]], ":", variant[["start_pos"]], " ",
            variant[["alt_allele"]], "    ", ext, sep=""))
    }
    
    json = try(request_from_ensembl(ext, headers=headers, build))
    if (class(json) == "try-error") {json = request_from_ensembl(alt_ext, headers=headers, build)}
    json = rjson::fromJSON(json)
    
    transcript = find_most_severe_transcript(json, include_hgvs)
    
    consequence = transcript$consequence_terms
    temp_severity = min(severity$rank[severity$consequence %in% consequence])
    consequence = severity$consequence[severity$rank == temp_severity]
    
    value = list(consequence=consequence, gene=transcript$gene_symbol)
    if (include_hgvs) {
        value$hgvsc = transcript$hgvsc
        value$hgvsp = transcript$hgvsp
    }
    
    if (include_deleterious_score) {
        if ("polyphen_prediction" %in% names(transcript)) {
            value$polyphen_prediction = transcript$polyphen_prediction
            value$polyphen_score = transcript$polyphen_score
            value$sift_prediction = transcript$sift_prediction
            value$sift_score = transcript$sift_score
        } else {
            value$polyphen_prediction = NA
            value$polyphen_score = NA
            value$sift_prediction = NA
            value$sift_score = NA
        }
    }
    
    return(value)
}

#' find the most severe transcript from Ensembl data
#'
#' @param ensembl_json json data for variant from Ensembl
#' @param include_hgvs boolean showing if we will be using HGVS predictions from
#'     the VEP consequence predictions. If true, we sort the trasncripts by their
#'     length, so that we consistently return the same transcript between
#'     different variants from the same gene. Otherwise we just return the first
#'     most severe transcript.
#' @param exclude_bad boolean to show whether we want to exclude nonsense
#'     mediated decay transcripts and so forth. We only supply a false value for
#'     recursive calling, when we have failed to find any transcript during
#'     normal usage.
#'
#' @export
#' @return a character string containing the most severe consequence, as per VEP
#'     annotation formats.
find_most_severe_transcript <- function(ensembl_json, include_hgvs=FALSE, exclude_bad=TRUE) {
    
    bad_transcripts = c("lincRNA", "nonsense_mediated_decay", "pseudogene",
        "transcribed_unprocessed_pseudogene")
    
    best_transcript = NA
    best_severity = NA
    symbol = NA
    
    # if we are dealing with an intergenic variant, the variant won't have any
    # transcript consequences, and so the function will enter an infinite
    # recursion. Instead return the json entry (appending the correct
    # annotations for the higher function)
    if (!("transcript_consequences" %in% names(ensembl_json[[1]]))) {
        nontranscript = ensembl_json[[1]]
        nontranscript$consequence_terms = nontranscript$most_severe_consequence
        nontranscript$gene_symbol = ""
        return(nontranscript)
    }
    
    transcripts = ensembl_json[[1]]$transcript_consequences
    # sort the transcripts by their protein lengths
    if (include_hgvs) {
        protein_lengths = sapply(transcripts, function(x) get_protein_length(x$transcript_id))
        protein_order = sort(protein_lengths, decreasing=TRUE, index.return=TRUE)
        transcripts = lapply(protein_order$ix, function(x) transcripts[[x]])
    }
    
    for (transcript in transcripts) {
        
        # don't bother to check some transcript types, depending on whether we
        if (exclude_bad & transcript$biotype %in% bad_transcripts) { next }
        
        # get consequence and severity rank in the current transcript
        consequence = transcript$consequence_terms
        temp_severity = min(severity$rank[severity$consequence %in% consequence])
        consequence = severity$consequence[severity$rank == temp_severity]
        
        # check if this is the most severe consequence; prefer HGNC transcripts
        if (is.na(best_severity) | temp_severity < best_severity |
                (symbol != "HGNC" & transcript$gene_symbol_source == "HGNC" &
                temp_severity == best_severity)) {
            best_severity = temp_severity
            best_transcript = transcript
            symbol = transcript$gene_symbol_source
        }
    }
    
    # sometimes we don't have any useable transcripts, in which case, select
    # any of the transcripts
    if (length(best_transcript) == 1) {
        if (is.na(best_transcript)) {
            best_transcript = find_most_severe_transcript(ensembl_json, exclude_bad=FALSE)
        }
    }
    
    return(best_transcript)
}

#' find the coding length for a ensembl transcript
#'
#' @param transcript_id json data for variant from Ensembl
#' @param build genome build to find consequences on
#' @param verbose flag indicating whether to print variants as they are checked
#'
#' @export
#' @return integer for CDS length.
get_protein_length <-function(transcript_id, build="grch37", verbose=FALSE) {
    ext = "/sequence/id/"
    ext = paste(ext, transcript_id, sep="")
    
    json = try(request_from_ensembl(ext, headers="type=protein", build), silent=TRUE)
    if (class(json) == "try-error") { return(0) }
    json = rjson::fromJSON(json)
    
    return(nchar(json$seq))
}

#' find the hgnc symbol overlapping a variant position
#'
#' @param variant data frame or list for a variant, containing columns named
#'     "chrom", "start_pos", and "end_pos" for a single variant
#' @param build genome build to find consequences on
#' @param verbose flag indicating whether to print variants as they are checked
#'
#' @export
#' @return a character string containing the HGNC symbol.
#'
#' @examples
#' get_gene_id_for_variant(data.frame(chrom=c("1"), start_pos=c("1000000"),
#'     end_pos=c("1000000")))
#' get_gene_id_for_variant(list(chrom="1", start_pos="1000000",
#'     end_pos="1000000"))
get_gene_id_for_variant <- function(variant, build="grch37", verbose=FALSE) {
    
    # make sure the end position is suitable for the Ensembl REST API request
    if (as.numeric(variant[["end_pos"]]) < as.numeric(variant[["start_pos"]])) {
        variant[["end_pos"]] = variant[["start_pos"]]
    }
    
    # define parts of the URL
    ext = "/overlap/region/human/"
    ext = paste(ext, variant[["chrom"]], ":", variant[["start_pos"]], ":", variant[["end_pos"]], "/", sep="")
    
    if (verbose) {
        print(paste("chr", variant[["chrom"]], ":", variant[["start_pos"]],
            "    ", ext, sep=""))
    }
    
    json = request_from_ensembl(ext, headers="feature=gene", build)
    json = rjson::fromJSON(json)
    
    # return blank string for variants not in genes
    if (length(json) > 0) {
        return(json[[1]]$external_name)
    }
    
    return("")
}

#' find genomic sequence within a region
#'
#' @param variant data frame or list for a variant, containing columns named
#'     "chrom", "start_pos", and "end_pos" for a single variant
#' @param build genome build to find consequences on
#' @param verbose flag indicating whether to print variants as they are checked
#'
#' @export
#' @return a character string containing the HGNC symbol.
#'
#' @examples
#' get_gene_id_for_variant(data.frame(chrom=c("1"), start_pos=c("1000000"),
#'     end_pos=c("1000000")))
#' get_gene_id_for_variant(list(chrom="1", start_pos="1000000",
#'     end_pos="1000000"))
get_sequence_in_region <- function(variant, build="grch37", verbose=FALSE) {
    
    # make sure the end position is suitable for the Ensembl REST API request
    if (as.numeric(variant[["end_pos"]]) < as.numeric(variant[["start_pos"]])) {
        variant[["end_pos"]] = variant[["start_pos"]]
    }
    
    # define the URL
    ext = "/sequence/region/human/"
    ext = paste(ext, variant[["chrom"]], ":", variant[["start_pos"]], ":",
        variant[["end_pos"]], ":1", sep="")
    
    if (verbose) {
        print(paste("chr", variant[["chrom"]], ":", variant[["start_pos"]],
            "    ", ext, sep=""))
    }
    
    json = request_from_ensembl(ext, build)
    json = rjson::fromJSON(json)
    
    # return blank string for variants not in genes
    if (length(json) > 0) {
        return(json$seq)
    }
    
    return("")
}

#' map coordiantes from one assembly to another
#'
#' @param variant data frame or list for a variant, containing columns named
#'     "chrom", "start_pos", and "end_pos" for a single variant
#' @param inbuild genome build to find consequences on
#' @param outbuild genome build to find consequences on
#' @param build build of REST server to use
#' @param verbose flag indicating whether to print variants as they are checked
#'
#' @export
#' @return a character string containing the HGNC symbol.
#'
#' @examples
#' map_coordinates(data.frame(chrom=c("1"), start_pos=c("1000000"),
#'     end_pos=c("1000000")))
#' map_coordinates(list(chrom="1", start_pos="1000000",
#'     end_pos="1000000"))
map_coordinates <- function(variant, inbuild='grch38', outbuild='grch37', build='grch37', verbose=FALSE) {
    
    # make sure the end position is suitable for the Ensembl REST API request
    if (as.numeric(variant[["end_pos"]]) < as.numeric(variant[["start_pos"]])) {
        variant[["end_pos"]] = variant[["start_pos"]]
    }
    
    # define the URL
    ext = paste("/map/human/", inbuild, "/", sep="")
    ext = paste(ext, variant[["chrom"]], ":", variant[["start_pos"]], "..",
        variant[["end_pos"]], ":1/", outbuild, sep="")
    
    if (verbose) {
        print(paste("chr", variant[["chrom"]], ":", variant[["start_pos"]],
            "    ", ext, sep=""))
    }
    
    json = request_from_ensembl(ext, build)
    json = rjson::fromJSON(json)
    
    # return blank string for variants not in genes
    mappings = json[['mappings']]
    if (length(mappings) > 0) {
        mapped = mappings[[1]][['mapped']]
        return(list('chrom'=mapped$seq_region_name, 'start_pos'=mapped$start,
            'end_pos'=mapped$end))
    }
    
    return(list('chrom'=variant[['chrom']], 'start_pos'=variant[['start_pos']],
        'end_pos'=variant[['end_pos']]))
}
jeremymcrae/publishedDeNovos documentation built on May 19, 2019, 5:08 a.m.