# ---- vargen Pipeline ----
#' @title Download the files needed to run \code{\link{vargen_pipeline}}
#' @description Only need to run it once. Will download the following files in
#' "install_dir":
#' \itemize{
#' \item the latest gwas catalog, eg: gwas_catalog_v1.0.2-associations_e93_r2019-01-11.tsv
#' \item hg19ToHg38.over.chain.gz (will be unzipped)
#' \item GTEx_Analysis_v8_eQTL.tar.gz (will be untared)
#' \item GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz
#' \item enhancer_tss_associations.bed
#' }
#'
#'
#' @param install_dir the path to the installation folder (default = "./")
#' @param gtex_version the version of gtex to download, only "v7" and "v8" are
#' supported (default = "v8")
#' @param timeout the timeout set in options(), reading/downloading files online
#' might fail with the default timeout of 60 seconds.
#' @param verbose if TRUE will print progress messages (default = FALSE)
#'
#' @return nothing, download files in "install_dir".
#'
#' @examples
#' vargen_install("./", verbose = TRUE)
#' @export
vargen_install <- function(install_dir = "./", gtex_version = "v8", timeout = 10000, verbose = FALSE){
original_timeout <- getOption("timeout")
options(timeout = timeout)
if(verbose) print(paste0("Setting the timeout to '", timeout, "'"))
if (!file.exists(install_dir)){
if(verbose) print(paste0("Creating folder '", install_dir, "'"))
dir.create(install_dir)
}
if(verbose) print("Dowloading FANTOM's enhancer tss associations")
# FANTOM5 TSS associations old link:
#utils::download.file("http://enhancer.binf.ku.dk/presets/enhancer_tss_associations.bed",
# destfile = paste0(install_dir, "/enhancer_tss_associations.bed"))
# FANTOM5 new link:
utils::download.file("https://slidebase.binf.ku.dk/human_enhancers/presets/serve/enhancer_tss_associations",
destfile = paste0(install_dir, "/enhancer_tss_associations.bed"))
cat("\n")
if(verbose) print("Downloading liftOver chain file from ucsc")
# liftOver file
utils::download.file(url = "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz",
destfile = paste0(install_dir, "/hg19ToHg38.over.chain.gz"))
R.utils::gunzip(filename = paste0(install_dir, "/hg19ToHg38.over.chain.gz"),
skip = TRUE, remove = TRUE)
# download gwas file:
if(verbose) print("Downloading the gwas catalog file from ebi")
gwasurl <- "https://www.ebi.ac.uk/gwas/api/search/downloads/full"
#gwasurl <- "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-associations.tsv"
# gwasheaders contains the http header from the URL
gwasheaders <- curlGetHeaders(url = gwasurl, redirect = TRUE, verify = TRUE)
# From the header, we can extract the line where the filename is
gwasheaders <- gwasheaders[grep("filename", gwasheaders)]
# Then we extract the filname from the header line, which should resemble:
# "Content-Disposition: attachement; filename=gwas_catalog_v1.0-associations_e96_r2019-10-14.tsv\r\n"
gwasfilename <- unlist(strsplit(x = gwasheaders, split = "filename="))[2]
# This should remove new lines "\r", "\r\n" or "\n"
gwasfilename <- gsub("\r?\n|\r", "", gwasfilename)
# Now that we have the gwas catalog filename, we can download it:
utils::download.file(url = gwasurl,
destfile = paste0(install_dir, "/", gwasfilename))
cat("\n")
# Add gtex folder
if(verbose) print("Downloading GTEx variant association file... This may take a while")
if(gtex_version == "v7") {
gtex_filename = "GTEx_Analysis_v7_eQTL.tar.gz"
gtex_url = paste0("https://storage.googleapis.com/adult-gtex/bulk-qtl/v8/single-tissue-cis-qtl/", gtex_filename)
} else if(gtex_version == "v8") {
gtex_filename = "GTEx_Analysis_v8_eQTL.tar"
# gtex_url = "https://storage.googleapis.com/gtex_analysis_v8/single_tissue_qtl_data/GTEx_Analysis_v8_eQTL.tar"
gtex_url = paste0("https://storage.googleapis.com/adult-gtex/bulk-qtl/v8/single-tissue-cis-qtl/", gtex_filename)
} else {
stop(paste0("Please set 'gtex_version' as v7 or v8, current value: '", gtex_version, "'"))
}
# The tar file is considered as a binary file, so the "mode = wb" option is needed
# or else there is a "corrupt archive" error during untar.
utils::download.file(url = gtex_url,
destfile = paste0(install_dir, "/", gtex_filename),
mode = "wb")
utils::untar(tarfile = paste0(install_dir, "/", gtex_filename),
exdir = paste0(install_dir))
if(file.remove(paste0(install_dir, "/", gtex_filename))){
if(verbose) print(paste0(install_dir, "/", gtex_filename, " untared and removed succesfully"))
} else{
print(paste0("Error while removing ", gtex_filename))
}
# Installing GTEx lookup table
if(verbose) print("Downloading GTEx lookup table... This may take a while")
gtex_lookup_filename <- "GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz"
gtex_lookup_url <- paste0("https://storage.googleapis.com/adult-gtex/references/v8/reference-tables/", gtex_lookup_filename)
utils::download.file(url = gtex_lookup_url,
destfile = paste0(install_dir, "/", gtex_lookup_filename),
mode = "wb")
options(timeout = original_timeout)
if(verbose) print(paste0("Resetting the timeout to previous value '", original_timeout, "'"))
}
#' @title Main vargen function, to get the list of variants
#' @description Will get a list of variants related to certain OMIM morbid IDs.
#' Be aware that some of these variants will not be necessarily associated to the
#' phenotype. We advise to filter the results by annotation ("CADD phred score",
#' "snpEff impact" etc...). If you want a smaller list of variants that are all
#' associated with the disease, then run \code{\link{get_variants_from_phenotypes}}
#' The variants are fetched from the following sources:
#' \itemize{
#' \item OMIM: get variants on the genes related to the disease
#' \item FANTOM5: get the variants on the enhancers of the OMIM genes
#' \item GTEx: get variants impacting the expression of the OMIM genes in specific tissues.
#' \item GWAS: get variants related to the phenotype of interest from the gwas catalog
#' }
#' The pipeline will also annotate the variants using \code{\link[myvariant]{getVariants}}
#'
#' @param vargen_dir directory with the following file (can be generated with
#' \code{\link{vargen_install}})
#' @param omim_morbid_ids a vector containing the omim morbid id(s) of the phenotype(s)
#' of interest. You can search on the Online Mendelian Inheritance in Man website
#' (https://www.omim.org/) or use \code{\link{list_omim_accessions}}
#' @param fantom_corr the minimum correlation (z-score) to consider a FANTOM5
#' enhancer/gene association valid (default: 0.25).
#' A z-score greater than 0 represents an element greater than the mean, this
#' means that this association has more correlation than random motifs.
#' @param outdir the output directory, some files will be written during the
#' running of this function
#' @param gtex_tissues a vector containing the name of the "signif_variant_gene_pairs.txt.gz"
#' files. Output from \code{\link{select_gtex_tissues}} can be used.
#' @param gwas_traits a vector with the trait of interest (as characters). The list
#' of available traits can be obtained with \code{\link{list_gwas_traits}}
#' @param gene_mart optional, a connection to ensembl gene mart, can be created
#' using \code{\link{connect_to_gene_ensembl}} (If missing this function will be
#' used to create the connection).
#' @param snp_mart optional, a connection to ensembl snp mart, can be created
#' using \code{\link{connect_to_snp_ensembl}} (If missing this function will be
#' used to create the connection).
#' @param verbose if TRUE, will print progress messages (default: FALSE)
#' @return a data.frame with the variants fetched from OMIM, FANTOM5, GTEx and GWAS.
#' The data.frame will contain the following columns:
#' \itemize{
#' \item chr (chromosome)
#' \item pos (position of the variant)
#' \item rsid (variant ID)
#' \item ensembl_gene_id ("gene id" of the gene associated with the variant)
#' \item hgnc_symbol ("hgnc symbol" of the gene associated with the variant)
#' \item source ("omim", "fantom5", "gtex" or "gwas")
#' \item trait (the "omim ids" seperated by ';' for omim,fantom and gtex variants and the gwas trait
#' for the gwas variants).
#' }
#'
#' @examples
#' vargen_install("./vargen_data/")
#'
#' # Simple query
#' DM1_simple <- vargen_pipeline(vargen_dir = "./vargen_data/", omim_morbid_ids = "222100",
#' fantom_corr = 0.25, outdir = "./", verbose = TRUE)
#'
#'
#' # Query with gtex and gwas
#' pancreas_tissues <- select_gtex_tissues(gtex_dir = "./vargen_data/GTEx_Analysis_v8_eQTL/",
#' tissues_query = "pancreas")
#'
#' # list_gwas_traits("diabetes")
#'
#' DM1 <- vargen_pipeline(vargen_dir = "./vargen_data/", omim_morbid_ids = "222100",
#' fantom_corr = 0.25, outdir = "./",
#' gtex_tissues = pancreas_tissues,
#' gwas_traits = "Type 1 diabetes", verbose = TRUE)
#' @export
vargen_pipeline <- function(vargen_dir, omim_morbid_ids, fantom_corr = 0.25,
outdir = "./", gtex_tissues, gwas_traits,
gene_mart, snp_mart, verbose = FALSE) {
if(missing(omim_morbid_ids)){
stop("Please provide at least one OMIM morbid id. Stopping now.")
}
if(!dir.exists(outdir)){
if(verbose) print(paste0("Creating folder '", outdir, "'"))
dir.create(outdir)
}
#_____________________________________________________________________________
# Loading the necessary resources
#_____________________________________________________________________________
if(missing(gene_mart) || class(gene_mart) != 'Mart'){
if(verbose) print("Connecting to the gene mart...")
gene_mart <- connect_to_gene_ensembl()
warning("Gene mart not provided (or not a valid Mart object).",
"We used one from connect_to_gene_ensembl() instead.")
}
if(missing(snp_mart) || class(snp_mart) != 'Mart'){
if(verbose) print("Connecting to the snp mart...")
snp_mart <- connect_to_snp_ensembl()
warning("Snp mart not provided (or not a valid Mart object).",
"We used one from connect_to_snp_ensembl() instead.")
}
# no gwas traits = no need to generate the gwas_cat object
if(!missing(gwas_traits)){
if(verbose) print("Building the gwascat object...")
gwas_cat <- create_gwas(vargen_dir)
# Check if the gwas traits are in the gwas catalog:
for(trait in gwas_traits){
if(!(trait %in% gwas_cat$`DISEASE/TRAIT`)){
stop(paste0("gwas trait '", trait, "' not found in gwas catalog, stopping now."))
}
}
}
if(verbose) print(paste0("Reading the enhancer tss association file for FANTOM5... '" ,
vargen_dir, "/enhancer_tss_associations.bed'"))
fantom_df <- prepare_fantom(enhancer_tss_association = paste0(vargen_dir,
"/enhancer_tss_associations.bed"))
hg19ToHg38.over.chain <- paste0(vargen_dir, "/hg19ToHg38.over.chain")
if(!file.exists(hg19ToHg38.over.chain)){
stop(paste0("Can not read: ", hg19ToHg38.over.chain, ", stopping now."))
}
gtex_lookup_file <- paste0(vargen_dir, "/GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz")
if(!file.exists(gtex_lookup_file)){
stop(paste0("Can not read: ", gtex_lookup_file, ", stopping now."))
}
#_____________________________________________________________________________
# Getting variants from genes related to OMIM disease
#_____________________________________________________________________________
if(verbose) print("Starting the pipeline...")
omim_all_genes <- data.frame()
master_variants <- data.frame()
for(omim_morbid in omim_morbid_ids){
if(verbose) print(paste0("Getting genes for OMIM: ", omim_morbid))
# First: get all the genes linked to the different omim ids:
omim_genes <- get_omim_genes(omim_morbid, gene_mart)
if(nrow(omim_genes) == 0){
if(verbose) print(paste0("warning: no genes found for omim id: ", omim_morbid))
} else {
omim_all_genes <- rbind(omim_all_genes, omim_genes)
}
}
if(nrow(omim_all_genes) > 0){
omim_all_genes <- unique(omim_all_genes)
# We get the variants on the genes:
genes_variants <- get_genes_variants(genes = omim_all_genes, verbose = verbose)
if(length(genes_variants) != 0) master_variants <- rbind(master_variants,
genes_variants)
# We get the variants on the enhancers of the genes:
fantom_variants <- get_fantom5_variants(fantom_df = fantom_df,
omim_genes = omim_all_genes,
corr_threshold = fantom_corr,
hg19ToHg38.over.chain = hg19ToHg38.over.chain,
verbose = verbose)
if(length(fantom_variants) != 0) master_variants <- rbind(master_variants,
fantom_variants)
# master_variants$trait <- omim_morbid
# print(head(master_variants))
if(verbose) print(paste0("Writing the list of genes to: ", outdir, "/genes_info.tsv"))
# We write the list of genes in a file.
utils::write.table(x = omim_all_genes, quote = FALSE, sep = "\t", row.names = FALSE,
file = paste0(outdir, "/genes_info.tsv"))
#_____________________________________________________________________________
# Getting variants associated with change of expression in GTEx (need tissues as input)
#_____________________________________________________________________________
if(!missing(gtex_tissues)){
if(verbose) print("Getting the GTEx variants...")
gtex_variants <- get_gtex_variants(tissue_files = gtex_tissues,
omim_genes = omim_all_genes,
gtex_lookup_file = gtex_lookup_file,
snp_mart = snp_mart,
verbose = verbose)
if(length(gtex_variants) != 0) master_variants <- rbind(master_variants,
gtex_variants)
} else{
if(verbose) print("No values for 'gtex_tissues', skipping GTEx step...")
}
# Adding the traits for each variant.
master_variants$trait <- ""
for(gene in unique(master_variants$hgnc_symbol)){
# First get the list of omim ids for this gene:
gene_omims <- paste0(unique(omim_all_genes[omim_all_genes$hgnc_symbol == gene,]$mim_morbid_accession),
collapse = ";")
# Then add it to the "trait" column
master_variants[master_variants$hgnc_symbol == gene,]$trait <- gene_omims
}
}
#_____________________________________________________________________________
# Getting variants associated to the disease in the gwas catalog
#_____________________________________________________________________________
# GWAS variants (only if list of gwas traits were given)
if(!missing(gwas_traits)){
if(verbose) print("Getting the gwas variants,,,")
master_variants <- rbind(master_variants, get_gwas_variants(gwas_traits = gwas_traits,
gwas_cat = gwas_cat))
} else{
if(verbose) print("No values for 'gwas_traits', skipping gwas step...")
}
# writing the variants data.frame to a file
if(verbose) print(paste0("Writing the variants to ",
paste0(outdir, "/vargen_variants.tsv")))
utils::write.table(x = unique(master_variants), append = FALSE, quote = FALSE,
sep = "\t", row.names = FALSE,
file = paste0(outdir, "/vargen_variants.tsv"))
return(unique(master_variants))
}
#' @title Get variants related to specific genes.
#' @description Alternative to vargen_pipeline for a set of custom genes.
#' The variants are fetched from the following sources:
#' \itemize{
#' \item GENE: get variants on the genes
#' \item FANTOM5: get the variants on the enhancers / promoters of the genes
#' \item GTEx: get variants impacting the expression of the genes in specific tissues.
#' \item GWAS: get variants related to the phenotype of interest from the gwas catalog
#' }
#'
#' @param vargen_dir directory with the following file (can be generated with
#' \code{\link{vargen_install}})
#' @param gene_ids list of ensembl gene IDs of interest
#' @param fantom_corr the minimum correlation (z-score) to consider a FANTOM5
#' enhancer/gene association valid (default: 0.25).
#' A z-score greater than 0 represents an element greater than the mean, this
#' means that this association has more correlation than random motifs.
#' @param outdir the output directory, some files will be written during the
#' running of this function
#' @param gtex_tissues a vector containing the name of the "signif_variant_gene_pairs.txt.gz"
#' files. Output from \code{\link{select_gtex_tissues}} can be used.
#' @param gwas_traits a vector with the trait of interest (as characters). The list
#' of available traits can be obtained with \code{\link{list_gwas_traits}}
#' @param gene_mart optional, a connection to ensembl gene mart, can be created
#' using \code{\link{connect_to_gene_ensembl}} (If missing this function will be
#' used to create the connection).
#' @param snp_mart optional, a connection to ensembl snp mart, can be created
#' using \code{\link{connect_to_snp_ensembl}} (If missing this function will be
#' used to create the connection).
#' @param verbose if TRUE, will print progress messages (default: FALSE)
#'
#' @return a data.frame with the variants fetched from OMIM, FANTOM5, GTEx and GWAS.
#' The data.frame will contain the following columns:
#' \itemize{
#' \item chr (chromosome)
#' \item pos (position of the variant)
#' \item rsid (variant ID)
#' \item ensembl_gene_id ("gene id" of the gene associated with the variant)
#' \item hgnc_symbol ("hgnc symbol" of the gene associated with the variant)
#' \item source ("omim", "fantom5", "gtex" or "gwas")
#' \item trait (an empty string for the gene, fantom and gtex variants, since
#' this comes from a list of genes, no omim id is associated with them.
#' Contains the gwas trait for the gwas variants. )
#' }
#'
#' @examples
#' # Simple query
#' vargen_install(install_dir = "./vargen_data/")
#'
#' vargen_custom(vargen_dir = "./vargen_data/",
#' gene_ids = c("ENSG00000166603", "ENSG00000155846"),
#' outdir = "./", verbose = TRUE)
#'
#' # With gwas and gtex
#' adipose_tissues <- select_gtex_tissues(gtex_dir = "./vargen_data/GTEx_Analysis_v8_eQTL/",
#' tissues_query = "adipose")
#' vargen_custom(vargen_dir = "./vargen_data/",
#' gene_ids = c("ENSG00000166603", "ENSG00000155846"),
#' outdir = "./", verbose = TRUE,
#' gtex_tissues = adipose_tissues,
#' gwas_traits = "Obesity")
#' @export
vargen_custom <- function(vargen_dir, gene_ids, fantom_corr = 0.25, outdir = "./",
gtex_tissues, gwas_traits, gene_mart, snp_mart,
verbose = FALSE) {
if (!dir.exists(outdir)){
if(verbose) print(paste0("Creating folder '", outdir, "'"))
dir.create(outdir)
}
#_____________________________________________________________________________
# Loading the necessary resources
#_____________________________________________________________________________
if(missing(gene_mart) || class(gene_mart) != 'Mart'){
if(verbose) print("Connecting to the gene mart...")
gene_mart <- connect_to_gene_ensembl()
warning("Gene mart not provided (or not a valid Mart object).",
"We used one from connect_to_gene_ensembl() instead.")
}
if(missing(snp_mart) || class(snp_mart) != 'Mart'){
if(verbose) print("Connecting to the snp mart...")
snp_mart <- connect_to_snp_ensembl()
warning("Snp mart not provided (or not a valid Mart object).",
"We used one from connect_to_snp_ensembl() instead.")
}
# no gwas traits = no need to generate the gwas object
if(!missing(gwas_traits)){
if(verbose) print("Building the gwascat object...")
gwas_cat <- create_gwas(vargen_dir)
# Check if the gwas traits are in the gwas catalog:
for(trait in gwas_traits){
if(!(trait %in% gwas_cat$`DISEASE/TRAIT`)) stop(paste0("gwas trait '", trait, "' not found in gwas catalog, stopping now."))
}
}
if(verbose) print(paste0("Reading the enhancer tss association file for FANTOM5... '" ,
vargen_dir, "/enhancer_tss_associations.bed'"))
fantom_df <- prepare_fantom(enhancer_tss_association = paste0(vargen_dir, "/enhancer_tss_associations.bed"))
hg19ToHg38.over.chain <- paste0(vargen_dir, "/hg19ToHg38.over.chain")
if(!file.exists(hg19ToHg38.over.chain)){
stop(paste0("Can not read: ", hg19ToHg38.over.chain, ", stopping now."))
}
gtex_lookup_file <- paste0(vargen_dir, "/GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz")
if(!file.exists(gtex_lookup_file)){
stop(paste0("Can not read: ", gtex_lookup_file, ", stopping now."))
}
master_variants <- data.frame()
#_____________________________________________________________________________
# Getting variants from genes related to OMIM disease
#_____________________________________________________________________________
# Get the genes related to the phenotype:
genes_info <- biomaRt::getBM(attributes = c("ensembl_gene_id", "chromosome_name",
"start_position", "end_position",
"hgnc_symbol"),
filters = c("ensembl_gene_id"),
values = c(gene_ids),
mart = gene_mart, uniqueRows = TRUE)
if(nrow(genes_info) > 0){
if(verbose) print(paste0("Writing the list of genes to: ",
outdir, "/custom_genes_info.tsv"))
utils::write.table(x = genes_info, quote = FALSE, sep = "\t",
row.names = FALSE,
file = paste0(outdir, "/custom_genes_info.tsv"))
# Get the variants on the genes:
master_variants <- get_genes_variants(genes = genes_info, verbose = verbose)
# Replacing "omim" by "gene"
master_variants$source <- "gene"
#_____________________________________________________________________________
# Getting variants on the enhancers of the OMIM genes, using FANTOM5
#_____________________________________________________________________________
fantom_variants <- get_fantom5_variants(fantom_df = fantom_df,
omim_genes = genes_info,
corr_threshold = fantom_corr,
hg19ToHg38.over.chain = hg19ToHg38.over.chain,
verbose = verbose)
if(length(fantom_variants) != 0) master_variants <- rbind(master_variants,
fantom_variants)
#_____________________________________________________________________________
# Getting variants associated with change of expression in GTEx (need tissues as input)
#_____________________________________________________________________________
if(!missing(gtex_tissues)){
if(verbose) print("Getting the GTEx variants...")
gtex_variants <- get_gtex_variants(tissue_files = gtex_tissues,
omim_genes = genes_info,
gtex_lookup_file = gtex_lookup_file,
snp_mart = snp_mart,
verbose = verbose)
if(length(gtex_variants) != 0) master_variants <- rbind(master_variants,
gtex_variants)
} else{
print("No values for 'gtex_tissues', skipping GTEx step...")
}
master_variants$trait <- ""
} else {
print(paste0("No genes found for: ", paste(gene_ids, collapse = ", ")))
}
#_____________________________________________________________________________
# Getting variants associated to the disease in the gwas catalog
#_____________________________________________________________________________
# GWAS variants (only if list of gwas traits were given)
if(!missing(gwas_traits)){
if(verbose) print("Getting the gwas variants...")
master_variants <- rbind(master_variants,
get_gwas_variants(gwas_traits = gwas_traits,
gwas_cat = gwas_cat))
} else{
if(verbose) print("No values for 'gwas_traits', skipping gwas step...")
}
# writing the variants data.frame to a file
if(verbose) print(paste0("Writing the variants to ",
paste0(outdir, "/custom_vargen_variants.tsv")))
utils::write.table(x = unique(master_variants), append = FALSE, quote = FALSE,
sep = "\t", row.names = FALSE,
file = paste0(outdir, "/custom_vargen_variants.tsv"))
return(unique(master_variants))
}
#' @title Generate a plot of the variants on the OMIM genes
#' @description The plot contains 4 tracks:
#' \itemize{
#' \item The chromosome, with a red marker on the gene location
#' \item The ensembl transcripts
#' \item The variant consequences, grouped by type (eg: INTRONIC, STOP LOST etc...),
#' each green bar represent a variant
#' \item The cadd phred score, each dot represent a variant
#' }
#'
#' @param annotated_snps a data.frame from the merging of the outputs of
#' \code{\link{vargen_pipeline}} and \code{\link{annotate_variants}}
#' @param outdir the directory that will contain the plots
#' @param rsid_highlight optional, a vector of rsids, which will be plotted in red
#' on the cadd track.
#' @param device only "pdf" and "png" are supported now.
#' @param verbose if TRUE, will display progress messages
#' @param gene_mart optional, a connection to ensembl gene mart, can be created
#' using \code{\link{connect_to_gene_ensembl}}. If missing this function will be
#' used to create the connection.
#'
#' @return nothing, create the plots in "outdir". One file per gene, the name format
#' is <hgnc_symbol>_<ensembl_id>_GVIZ
#'
#' @examples
#' vargen_install("./vargen_data/")
#'
#' # Simple query
#' gene_mart <- connect_to_gene_ensembl()
#' DM1_simple <- vargen_pipeline(vargen_dir = "./vargen_data/", omim_morbid_ids = "222100",
#' fantom_corr = 0.25, outdir = "./", verbose = TRUE)
#'
#' vargen_visualisation(annotated_snps = DM1_simple, verbose = TRUE,
#' outdir = "./DM1_gviz", device = "png",
#' gene_mart = gene_mart)
#' @export
vargen_visualisation <- function(annotated_snps, outdir = "./", rsid_highlight,
device = "pdf", verbose = FALSE, gene_mart){
if(device != "png" && device != "pdf"){
stop("Please specify device as png or pdf")
}
if (!file.exists(outdir)){
if(verbose) print(paste0("Creating folder '", outdir, "'"))
dir.create(outdir)
}
if(missing(gene_mart) || class(gene_mart) != 'Mart'){
gene_mart <- connect_to_gene_ensembl()
warning("Gene mart not provided (or not a valid Mart object).",
"We used one from connect_to_gene_ensembl() instead.")
}
bckg_col <- "#D95F02"
# Get genes position for omim genes (not gwas / gtex genes as we currently limit
# the plot to start and stop of the gene, and gwas snps often lies outside these coordinates)
omim_genes <- annotated_snps[annotated_snps$source == "omim","ensembl_gene_id"]
genes_list <- biomaRt::getBM(attributes = c("ensembl_gene_id", "chromosome_name",
"start_position", "end_position",
"hgnc_symbol"),
filters = c("ensembl_gene_id"),
values = unique(omim_genes),
mart = gene_mart, uniqueRows = TRUE)
# UCSC chr format
genes_list$chromosome_name <- paste0("chr", genes_list$chromosome_name)
# From get_genes_from_omim output:
genes <- genes_list[,2:4]
colnames(genes) <- c("chromosome", "start", "end")
regions <- GenomicRanges::makeGRangesFromDataFrame(genes)
GenomeInfoDb::genome(regions) <- "hg38"
GenomeInfoDb::seqlevelsStyle(regions) <- "UCSC"
for(i in 1:nrow(genes_list)){
if(verbose) print(genes_list[i,]$hgnc_symbol)
# Subselect the snps on the gene locus only:
track_variants <- subset(annotated_snps, annotated_snps$chr == genes_list[i,]$chromosome_name)
track_variants <- subset(track_variants,track_variants$pos >= genes_list[i,]$start_position)
track_variants <- subset(track_variants,track_variants$pos <= genes_list[i,]$end_position)
track_variants <- track_variants[!(is.na(track_variants$consequence)),]
track_variants <- track_variants[!(track_variants$consequence == ""),]
track_variants$chr <- as.vector(track_variants$chr)
if(nrow(track_variants) != 0){
# Name of consequences are displayed on the left of the track, so we leave 10%
# of the gene length as a margin on the left to avoid cutting names.
gene_length <- genes_list[i,]$end_position - genes_list[i,]$start_position
start_plot <- min(track_variants$pos) - (15/100) * gene_length
end_plot <- max(track_variants$pos) + 10
# Track of variants consequences
conseq_track <- Gviz::AnnotationTrack(background.title = bckg_col,
start = track_variants$pos,
end = track_variants$pos,
chromosome = unique(track_variants$chr),
group = track_variants$consequence,
genome = "hg38", col = "#1B9E77",
fill = "#1B9E77", fontsize = 18,
name = "Variant Consequence")
# Need to have the variants in position order for the data track
cadd_track_variants <- track_variants[!(is.na(track_variants$cadd_phred)),]
cadd_track_variants <- cadd_track_variants[order(cadd_track_variants$pos),]
if(nrow(cadd_track_variants) != 0){
# Track for cadd phrad score, width of 1 because of SNP
cadd_track <- Gviz::DataTrack(background.title = bckg_col,
data = cadd_track_variants$cadd_phred,
start = cadd_track_variants$pos,
width = 1, fontsize = 16,
chromosome = unique(cadd_track_variants$chr),
genome = "hg38", name = "CADD Phred score",
ylim = c(0,40), group = cadd_track_variants$group)
# Overlaying a second data track to highlight some snps:
# Maybe it would be better to use the "group" paramater of the
# DataTrack rather than overlapping data tracks.
cadd_highlights <- data.frame()
if(!missing(rsid_highlight)){
cadd_highlights <- cadd_track_variants[cadd_track_variants$rsid %in% rsid_highlight,]
if(nrow(cadd_highlights) != 0){
cadd_highlight_track <- Gviz::DataTrack(background.title = bckg_col,
data = cadd_highlights$cadd_phred,
start = cadd_highlights$pos,
width = 1, fontsize = 16,
group = cadd_highlights$rsid,
chromosome = unique(cadd_highlights$chr),
genome = "hg38", name = "CADD Phred score",
col = "red", ylim = c(0,40))
}
}
}
gene_track <- Gviz::UcscTrack(background.title = bckg_col, fontsize = 18,
genome = "hg38", track = "knownGene",
chromosome = genes_list[i,]$chromosome_name,
from = start_plot, to = end_plot,
trackType = "GeneRegionTrack",
rstarts = "exonStarts", rends = "exonEnds",
gene = "name", symbol = "name", transcript = "name",
strand = "strand", fill = "#8282d2",
transcriptAnnotation = "transcript",
name = genes_list[i,"hgnc_symbol"])
itrack <- Gviz::IdeogramTrack(genome="hg38", chromosome = genes_list[i,]$chromosome_name)
axis_track <- Gviz::GenomeAxisTrack()
plot_name <- paste0(outdir,"/",genes_list[i,"hgnc_symbol"], "_", genes_list[i,"ensembl_gene_id"], "_GVIZ")
if(device == "pdf") grDevices::pdf(paste0(plot_name,".pdf"), width = 15, height = 10)
if(device == "png") grDevices::png(paste0(plot_name,".png"), width = 15, height = 10, units = "in", res = 300)
tracklist <- list(itrack, axis_track, gene_track, conseq_track)
# If some variants are to be highlighted, we combine the cadd track and the
# highlight track into one "overlay track". Else, if we have cadd scores
# we just plot the "cadd_track" on its own.
if(nrow(cadd_highlights) != 0){
highlight_track <- Gviz::OverlayTrack(background.title = bckg_col,
trackList = list(cadd_track,
cadd_highlight_track))
tracklist <- append(tracklist, highlight_track)
} else {
if(nrow(cadd_track_variants) != 0){
tracklist <- append(tracklist, cadd_track)
}
}
Gviz::plotTracks(tracklist,
from = start_plot,
to = end_plot,
groupAnnotation = "group")
grDevices::dev.off()
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.