R/tweak_lofs.R

Defines functions check_for_last_base_in_exon get_exon_ends get_gene_ids_for_hgnc get_transcript_ids_for_ensembl_gene_ids request_from_ensembl

Documented in check_for_last_base_in_exon get_exon_ends get_gene_ids_for_hgnc get_transcript_ids_for_ensembl_gene_ids request_from_ensembl

#' modifies the consequence where the variant is at the last base in the exon
#'
#' @param variant row of dataframe
#' @param exon_ends vector of exon end positions for the variants gene.
#' @param strand which chromosome strand the gene is on, 1 for plus strand, and
#'     -1 for minus strand.
#' @export
#'
#' @return consequence for variant.
check_for_last_base_in_exon <- function(variant, exon_ends, strand) {
    
    allowed = c("missense_variant", "synonymous_variant")
    
    # ignore variants where the consequence isn't missense or synonymous
    if (!variant[["CQ"]] %in% allowed) { return(variant[["CQ"]]) }
    
    # we don't modify variants that don't have G as reference allele (or C for
    # genes on - strand)
    required_base = "G"
    if (strand == -1) { required_base = "C" }
    if (variant[["REF"]] != required_base) { return(variant[["CQ"]]) }
    
    if (variant[["POS"]] %in% exon_ends) {
        variant[["CQ"]] = "splice_donor_variant"
        cat(paste(variant[["CHROM"]], ":", variant[["POS"]], " is non-LoF ref allele G at last base of exon\n", sep=""))
    }
    
    return(variant[["CQ"]])
}

#' gets all exon end positions for an HGNC symbol
#'
#' @param hgnc_symbol HGNC symbol for gene of interest.
#' @export
#'
#' @return list of vectors of exon end chromosome positions.
get_exon_ends <- function(hgnc_symbol) {
    base_url = "http://grch37.rest.ensembl.org/"
    
    gene_ids = get_gene_ids_for_hgnc(base_url, hgnc_symbol)
    transcript_ids = get_transcript_ids_for_ensembl_gene_ids(base_url, gene_ids, hgnc_symbol)
    
    all_ends = c()
    strand = 1
    for (transcript_id in transcript_ids) {
        ext = paste("/overlap/id/", transcript_id, "?feature=exon", sep="")
        url = paste(base_url, ext, sep="")
        json = request_from_ensembl(url)
        json = rjson::fromJSON(json)
        
        # restrict to the exons for the transcript
        exons = json[sapply(json, function(x) x[["Parent"]] == transcript_id)]
        
        strand = exons[[1]]$strand
        # get the final position of each exon
        ends = sapply(exons, function(x) x[["end"]])
        if (strand == -1) {
            ends = sapply(exons, function(x) x[["start"]])
        }
        
        all_ends = c(all_ends, ends)
    }
    
    # we only need a unique set of exon end positions, since we simply match
    # against these positions. Also include the strand, so we can check if the
    # base should be "G", or "C".
    values = list(all_ends=unique(all_ends), strand=strand)
    
    return(values)
}

#' gets the gene IDs for HGNC symbols
#'
#' @param base_url url for REST server.
#' @param hgnc_symbol HGNC symbol for gene of interest.
#' @export
#'
#' @return vector of ensembl gene IDs.
get_gene_ids_for_hgnc <- function(base_url, hgnc_symbol) {
    ext = paste("/xrefs/symbol/homo_sapiens/", hgnc_symbol, sep="")
    url = paste(base_url, ext, sep="")
    
    json = request_from_ensembl(url)
    json = rjson::fromJSON(json)
    
    genes = json[sapply(json, function(x) x[["type"]] == "gene")]
    gene_ids = sapply(genes, function(x) x[["id"]])
    
    return(gene_ids)
}

#' gets the transcript IDs for ensembl gene IDs
#'
#' @param base_url url for REST server.
#' @param gene_ids vector of gene IDs.
#' @param hgnc HGNC symbol for gene of interest.
#' @export
#'
#' @return vector of transcript IDs.
get_transcript_ids_for_ensembl_gene_ids <- function(base_url, gene_ids, hgnc) {
    chroms = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11",
        "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22",
        "X", "Y")
    hgnc_symbols = hgnc
    
    transcript_ids = c()
    for (gene_id in gene_ids) {
        ext = paste("/overlap/id/", gene_id, "?feature=transcript", sep="")
        url = paste(base_url, ext, sep="")
        json = request_from_ensembl(url)
        json = rjson::fromJSON(json)
        
        for (item in json) {
            # ignore transcripts not on the standard chromosomes
            # (non-default chroms fail to map the known de novo variants
            # to the gene location
            if (item[["Parent"]] != gene_id |
                !(item[["seq_region_name"]] %in% chroms) |
                all(!grepl(paste(hgnc_symbols, collapse="|"), item[["external_name"]]))) {
                next
            }
            
            transcript_ids = c(transcript_ids, item[["id"]])
        }
    }
    
    return(transcript_ids)
}

timeEnv = new.env()
assign("initial_time", Sys.time(), envir=timeEnv)

#' make a URL request to the Ensembl service
#'
#' @param url string of URL to be accessed
#' @param tries number of attempts that have been mader to access the URL
#'
#' @export
#' @return a character string, typically json encoded
request_from_ensembl <- function(url, tries=0) {
    
    # strip any possible whitespace from the url, since imported datasets
    # occasionally contain whitespace in the values used to construct the URL.
    url = gsub(" ", "", url)
    
    # check that we are not requesting urls fater than that allowed by Ensembl,
    # sleep until the period expires
    current_time = Sys.time()
    diff = 0.067 - as.numeric(current_time - get("initial_time", current_time, envir=timeEnv))
    if (diff > 0) { Sys.sleep(diff) }
    
    # set the previous request time to that of the current request
    assign("initial_time", current_time, envir=timeEnv)
    
    # cut out after making 5 attempts to access the url
    tries = tries + 1
    stopifnot(tries <= 5)
    
    request = httr::GET(url)
    
    # handle the possible http request return status codes, such as when the
    # server is unavailable, when we have made too many requests, or requested
    # an impossible URL.
    if (request$status_code == 503) { # server down
        Sys.sleep(30)
        return(request_from_ensembl(url, tries))
    } else if (request$status_code == 429) { # too frequent requests
        reset_time = as.numeric(request$headers$`x-ratelimit-reset`)
        Sys.sleep(reset_time)
        return(request_from_ensembl(url, tries))
    } else if (request$status_code == 400) { # bad url
        msg = paste("bad url request: ", url, sep = "")
        stop(msg)
    } else if (request$status_code != 200) { # other status errors
        msg = paste("unknown status: ", request$status_code, " at: ", url, sep = "")
        stop(msg)
    }
    
    return(intToUtf8(request$content))
}
jeremymcrae/recessiveStats documentation built on May 19, 2019, 5:08 a.m.