#' @title get_variants_in_genes
#' @description Returns all variants that are within a gene (defined by start/stop
#' in the reference dataset).
#' @param df data.frame with three columns: chr, bp and id.
#' @param reference a reference dataset that contains 6 columns:
#' ensembl_gene_id, ensembl_transcript_id, hgnc_symbol,
#' chromosome_name, start_position, end_position. This could, for
#' instance be retrieved by biomart.
#' @param window integer. How many bases should the window be extended upstrea/downstream of the gene.
#' @note Assumes genes are aligned to positive strand.
#' @author flassen
#' @export
get_variants_in_genes = function(df, reference, window = 0){
if (!is.numeric(window)) stop('arg window must be integer')
if (window < 0) warning('arg window is less than zero! Truncating genes at start/end.')
#require(data.table)
# df: two columns, chromosome, bpstart, variant id
# reference: data. frame with columns:
colnames(df) = c('chr','bp','id')
colnames(reference) = c("ensembl_gene_id", "ensembl_transcript_id", "hgnc_symbol", "chromosome_name", "start_position", "end_position")
reference = setDT(reference)
n = nrow(df)
result = lapply(1:n, function(i){
# find overlap
overlap_reference = reference[reference$chromosome_name == df$chr[i] &
(reference$start_position - window) <= df$bp[i] &
(reference$end_position + window) >= df$bp[i], ]
# collapse into one single line
transcript_ids = paste(overlap_reference$ensembl_transcript_id, collapse = ';')
overlap_reference$ensembl_transcript_id = NULL
overlap_reference = overlap_reference[!duplicated(overlap_reference),]
overlap_reference$ensembl_transcript_id = transcript_ids
# append with variant ID
overlap_reference$variant_position = df$bp[i]
overlap_reference$variant_id = df$id[i]
return(overlap_reference)
})
merged = do.call(rbind, result)
return(merged)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.