#' This function downloads a list of all genes and saves a tibble of Entrez gene IDs, gene number,
#' and synonyms in the global environment
#' @example build_gene_id_syn() -> GENE_ID
#' @export
#'
#' @import tibble
#' @import magrittr
#' @import stringr
#' @import dplyr
#' @import ontologyIndex
#' @import purrr
#' @import readr
#' @import tidyr
#' @import RColorBrewer
#' @import wordcloud
build_gene_id_syn <- function(){
#### RefSeq Gene IDs ####
message("Downloading the RefSeq gene information table")
suppressMessages(
HUMAN_GENE_ID <<- read_tsv("ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz", progress = TRUE) %>%
select(GeneID, Symbol, Synonyms)
)
}
#' Aggregate all the databases needed to run PhenolyzerR
#'
#' `build_gene_disease_reference` is downloads gene-disease mappings from OMIM, ClinVar, and the GWAS repository.
#' This will only need to be run once. When the script finishes running,
#' you will have a table called `DB_COMPILED_GENE_DISEASE_SCORES` in your local environment.
#'
#' @return A Tibble
#'
#' @importFrom stats aggregate
#' @importFrom stats complete.cases
#' @importFrom utils download.file
#' @export
build_gene_disease_reference <- function(){
# Compile the Gene - Disease Scores
# This is the meat of this whole project!!
#### OMIM ####
# Get OMIM Genemap
# Hope I'm not breaking any laws by doing this...
message("Downloading the OMIM genemap")
tryCatch(
suppressMessages(suppressWarnings(
OMIM_conf <- read_tsv("https://data.omim.org/downloads/WIFQrs35Rw-0RbsyXS9pWA/genemap.txt", skip = 3) %>%
select("Gene Symbols", "MIM Number", "Confidence")
)),
error = function(e){message("Unable to Connect to OMIM. Please try again! If problem persists, the URL may have expired.")}
)
suppressMessages(suppressWarnings(
MIMs <- read_tsv("https://data.omim.org/downloads/WIFQrs35Rw-0RbsyXS9pWA/morbidmap.txt", skip = 3) %>% # Download the morbidmap
select(-`Cyto Location`, -`Gene Symbols`) # Get rid of cyto location and gene symbols column
))
OMIM <- left_join(OMIM_conf, MIMs, "MIM Number") # Join these tables together
S_SCORE_OMIM <- list("C" = 1, "P" = 0.75, "L" = 0.5, "I" = 0.25)
# These function are used in the next step::
cleanup <- function(term){
term %>%
str_remove("\\(\\d\\)") %>%
str_remove(", \\d+ $") %>%
gsub("^\\W(.*?)\\W*susc?eptibi?lity( to)?,?.*", "\\1", ., perl = TRUE) %>%
str_replace("[Aa]utism \\d+", "autism") %>%
str_replace("\\berthematosus\\b", "erythematosus")
}
eliminateNonwords <- function(term){
term %>%
str_remove_all(regex("[[:punct:]]")) %>%
str_remove("\\bwith\\b.*$") %>%
str_remove("\\bwithout\\b.*$") %>%
trimws
}
OMIM_SCORES <- OMIM %>% select("Gene Symbols", "# Phenotype", "MIM Number", "Confidence")
OMIM_SCORES <- OMIM_SCORES[complete.cases(OMIM_SCORES),] %>% # Remove NA Rows
# Make sure all confidence codes are capital letters
mutate(Confidence = toupper(Confidence)) %>%
# Get rid of rows which have something other than the 4 allowed codes
filter(Confidence %in% c("C", "P", "L", "I")) %>%
# Convert confidence codes to scores
mutate("Score" = unlist(S_SCORE_OMIM[Confidence])) %>%
# Add a Source column
mutate("Source" = "OMIM") %>%
# Get rid of the confidence codes
select(-Confidence) %>%
# Add "OMIM" ot source column
mutate(`MIM Number` = paste("OMIM:", `MIM Number`, sep = "")) %>%
# Get rid of comma separation in gene column, so every column has only one gene
mutate(`Gene Symbols` = strsplit(as.character(`Gene Symbols`), ", ")) %>%
unnest(`Gene Symbols`) %>%
# Cleanup the Disease names column
mutate(`# Phenotype` = `# Phenotype` %>% cleanup %>% eliminateNonwords)
OMIM_SCORES %<>% select(`Gene Symbols`, `# Phenotype`, `MIM Number`, Score, Source)
names(OMIM_SCORES) <- c("GENE", "DISEASE", "DISEASE_ID", "SCORE", "SOURCE")
#### ClinVar ####
message("Downloading ClinVar gene_condition_source_id file")
suppressMessages(clinVar <- read_tsv("ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/gene_condition_source_id"))
clinVar %<>% select(AssociatedGenes, RelatedGenes, DiseaseName, DiseaseMIM)
clinVar$genes <- paste(clinVar$AssociatedGenes, clinVar$RelatedGenes) %>% gsub("NA", "", .) %>% trimws
clinVar %<>% select(-AssociatedGenes, -RelatedGenes)
clinVar <- clinVar[complete.cases(clinVar),]
clinVar <- aggregate(list(count=rep(1,nrow(clinVar))), clinVar, length) %>% as.tibble
SCORE_ClinVar <- function(x){
if (x == 4) return (1.00 * 0.25)
if (x == 3) return (0.75 * 0.25)
if (x == 2) return (0.50 * 0.25)
if (x == 1) return (0.25 * 0.25)
}
ClinVar_SCORES <- clinVar %>% select(genes, DiseaseName, DiseaseMIM, count)
ClinVar_SCORES <- ClinVar_SCORES[complete.cases(ClinVar_SCORES),] %>% # Get rid of missing values
mutate(Score = sapply(count, SCORE_ClinVar)) %>% # Compute the score of each one based on the alignment
mutate(DiseaseMIM = paste("OMIM:", DiseaseMIM, sep = "")) %>%
select(-count) %>% mutate(SOURCE = "ClinVar")
names(ClinVar_SCORES) <- c("GENE", "DISEASE", "DISEASE_ID", "SCORE", "SOURCE")
#### GWAS ####
# Get GWAS associations
message("Downloading GWAS gene condition associations")
suppressMessages(suppressWarnings(
GWAS <- read_tsv("https://www.ebi.ac.uk/gwas/api/search/downloads/full")
))
GWAS_SCORE_WEIGHT <- 0.25
# This function gets rid of anything in parentheses
getRidOfAnnotation <- function(x){x %>% str_remove("\\(.*?\\)")}
GWAS_SCORES <- GWAS %>%
# Select the columns I want
select(`REPORTED GENE(S)`, `DISEASE/TRAIT`, PUBMEDID, `P-VALUE`) %>%
# Get a raw score by calculaitn 1 - p.value and multiple by score weight
mutate(RAW_SCORE = (1 - `P-VALUE`) * GWAS_SCORE_WEIGHT) %>%
select(-`P-VALUE`) %>%
# Remove annotations from disease names
mutate(`DISEASE/TRAIT` = getRidOfAnnotation(`DISEASE/TRAIT`)) %>%
# Remove intergenic of unknown genes
filter(!grepl("[Ii]ntergenic|other|NR", `REPORTED GENE(S)` )) %>%
# Add "PUBMED" to PUBMEDID
mutate(PUBMEDID = str_c("PUBMED:", PUBMEDID)) %>%
# Get rid of duplicate rows and rows with no scores
distinct %>% filter(!is.na(RAW_SCORE)) %>%
# Add Column for source
mutate(SOURCE = "GWAS") %>%
# Get rid of comma separation in gene column, so every column has only one gene
mutate(`REPORTED GENE(S)` = strsplit(as.character(`REPORTED GENE(S)`), ", ")) %>%
unnest(`REPORTED GENE(S)`) %>% select(5,1,2,3,4)
names(GWAS_SCORES) <- c("GENE", "DISEASE", "DISEASE_ID", "SCORE", "SOURCE")
#### Put Everything Together ####
DB_COMPILED_GENE_DISEASE_SCORES <- rbind(OMIM_SCORES, ClinVar_SCORES, GWAS_SCORES)
#### Fix Synonymous Gene Names from Different DBs ####
message("Fixing synonymous gene names across databases")
if (!exists("HUMAN_GENE_ID")) build_gene_id_syn()
syns <- HUMAN_GENE_ID$Synonyms %>% strsplit(split = "\\|")
names(syns) <- HUMAN_GENE_ID$Symbol
need_fixin <- DB_COMPILED_GENE_DISEASE_SCORES[DB_COMPILED_GENE_DISEASE_SCORES$GENE %in% unlist(syns),]
good <- DB_COMPILED_GENE_DISEASE_SCORES[!(DB_COMPILED_GENE_DISEASE_SCORES$GENE %in% unlist(syns)),]
# Make a Thesaurus of gene syonyms, and fix ones which have a gene synonym as their name
genes <- sapply(syns, length)
genes <- rep(names(genes), genes)
syns %<>% unlist(use.names = FALSE)
names(genes) <- syns
need_fixin %<>% mutate(GENE = genes[GENE])
message("Done!")
DB_COMPILED_GENE_DISEASE_SCORES <<- rbind(good, need_fixin) %>% distinct %>% mutate(GENE = trimws(GENE))
}
#' @export
build_gene_prediction_reference <- function(){
message("Downloading the whole bioGrid database")
message("I'll get rid of this huge file :-P")
temp <- tempfile(pattern = "Biogrid", fileext = ".tsv")
download.file("https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.4.160/BIOGRID-ALL-3.4.160.tab2.zip",temp)
bioGrid <- read_tsv(unzip(temp))
unlink(temp)
bioGrid %>%
filter(`Organism Interactor A` == 9606 & `Organism Interactor B` == 9606) %>%
filter(Score != "-") %>%
select(`Official Symbol Interactor A`, `Official Symbol Interactor B`, Score, `Source Database`)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.