This document shows how to feed the Biological Entity Dictionary (BED).
It can be adapted according to specific needs and DB access.
The BED functions used to feed the DB are not exported to avoid
unintended modifications of the DB. To call them, they are preceded
by BED:::
.
In this example several source databases are dumped and their content
is integrated in BED.
Some helper functions are provided to get information from famous databases.
The following chunk is used to configure source versions.
The reDumpThr
object is used to define time intervals during which some
data sources should not be re-downloaded.
## library(knitr) library(BED) library(jsonlite) library(rvest) library(dplyr) library(readr) library(stringr) ## config <- jsonlite::read_json("build_config.json") config <- lapply( config, function(x){ if(!is.character(x)){ return(x) }else{ sub(pattern="___HOME___", replacement=Sys.getenv("HOME"), x = x) } } ) config <- lapply( config, function(x){ if(!is.character(x)){ return(x) }else{ sub(pattern="___ROOT___", replacement=config$ROOT, x = x) } } ) ## wd <- config$BED_WORKING ## # opts_knit$set(root.dir=wd) opts_chunk$set( eval=TRUE, message=FALSE # root.dir=wd ) ## Specific config bedInstance <- config$BED_INSTANCE bedVersion <- config$BED_VERSION ## Identify the latest ensembl release here: https://www.ensembl.org/ ensembl_org_dir <- grep( "_core_", rvest::html_table( rvest::read_html("https://ftp.ensembl.org/pub/current_mysql/") )[[1]]$Name, value=TRUE ) ensembl_org_table <- do.call(rbind, lapply( strsplit(ensembl_org_dir, split="_"), function(x){ gv <- sub("/", "", x[length(x)]) e <- x[length(x)-1] core <- which(x=="core") org <- paste(x[1:(core-1)], collapse=" ") substr(org, 1, 1) <- toupper(substr(org, 1,1)) return(data.frame( organism=org, release=e, genome_version=gv )) } )) ensembl_org_table <- ensembl_org_table[ order(as.numeric(ensembl_org_table$genome_version), decreasing=T), ] ensembl_org_table <- ensembl_org_table[ which(!duplicated(ensembl_org_table$organism)), ] ensembl_release <- unique(ensembl_org_table$release) stopifnot(length(ensembl_release)==1) org <-"Homo sapiens" ensembl_Hsapiens <- list( release=ensembl_release, organism=org, gv=ensembl_org_table$genome_version[which(ensembl_org_table$organism==org)], gdbCref=c( # Gene cross-references DBs "HGNC"="HGNC", "EntrezGene"="EntrezGene", "Vega_gene"="Vega_gene", "Ens_Hs_gene"="Ens_gene" ), gdbAss=c( # Gene associated IDs (DB) "miRBase"="miRBase", "MIM_GENE"="MIM_GENE", "UniGene"="UniGene" ), tdbCref=c( # Transcript cross-references DBs "RefSeq_mRNA"="RefSeq", "RefSeq_ncRNA"="RefSeq", "RefSeq_mRNA_predicted"="RefSeq", "RefSeq_ncRNA_predicted"="RefSeq", "Vega_transcript"="Vega_transcript", "Ens_Hs_transcript"="Ens_transcript" ), pdbCref=c( # Peptide cross-references DBs "RefSeq_peptide"="RefSeq_peptide", "RefSeq_peptide_predicted"="RefSeq_peptide", "Uniprot/SPTREMBL"="Uniprot", "Uniprot/SWISSPROT"="Uniprot", "Vega_translation"="Vega_translation", "Ens_Hs_translation"="Ens_translation" ), canChromosomes=c(1:22, "X", "Y", "MT") ) org <- "Mus musculus" ensembl_Mmusculus <- list( release=ensembl_release, organism=org, gv=ensembl_org_table$genome_version[which(ensembl_org_table$organism==org)], gdbCref=c( # Gene cross-references DBs "MGI"="MGI", "EntrezGene"="EntrezGene", "Vega_gene"="Vega_gene", "Ens_Mm_gene"="Ens_gene" ), gdbAss=c( # Gene associated IDs (DB) "miRBase"="miRBase", "UniGene"="UniGene" ), tdbCref=c( # Transcript cross-references DBs "RefSeq_mRNA"="RefSeq", "RefSeq_ncRNA"="RefSeq", "RefSeq_mRNA_predicted"="RefSeq", "RefSeq_ncRNA_predicted"="RefSeq", "Vega_transcript"="Vega_transcript", "Ens_Mm_transcript"="Ens_transcript" ), pdbCref=c( # Peptide cross-references DBs "RefSeq_peptide"="RefSeq_peptide", "RefSeq_peptide_predicted"="RefSeq_peptide", "Uniprot/SPTREMBL"="Uniprot", "Uniprot/SWISSPROT"="Uniprot", "Vega_translation"="Vega_translation", "Ens_Mm_translation"="Ens_translation" ), canChromosomes=c(1:19, "X", "Y", "MT") ) org <- "Rattus norvegicus" ensembl_Rnorvegicus <- list( release=ensembl_release, organism=org, gv=ensembl_org_table$genome_version[which(ensembl_org_table$organism==org)], gdbCref=c( # Gene cross-references DBs "RGD"="RGD", "EntrezGene"="EntrezGene", "Vega_gene"="Vega_gene", "Ens_Rn_gene"="Ens_gene" ), gdbAss=c( # Gene associated IDs (DB) "miRBase"="miRBase", "UniGene"="UniGene" ), tdbCref=c( # Transcript cross-references DBs "RefSeq_mRNA"="RefSeq", "RefSeq_ncRNA"="RefSeq", "RefSeq_mRNA_predicted"="RefSeq", "RefSeq_ncRNA_predicted"="RefSeq", "Vega_transcript"="Vega_transcript", "Ens_Rn_transcript"="Ens_transcript" ), pdbCref=c( # Peptide cross-references DBs "RefSeq_peptide"="RefSeq_peptide", "RefSeq_peptide_predicted"="RefSeq_peptide", "Uniprot/SPTREMBL"="Uniprot", "Uniprot/SWISSPROT"="Uniprot", "Vega_translation"="Vega_translation", "Ens_Rn_translation"="Ens_translation" ), canChromosomes=c(1:20, "X", "Y", "MT") ) org <- "Sus scrofa" ensembl_Sscrofa <- list( release=ensembl_release, organism=org, gv=ensembl_org_table$genome_version[which(ensembl_org_table$organism==org)], gdbCref=c( # Gene cross-references DBs "EntrezGene"="EntrezGene", "Vega_gene"="Vega_gene", "Ens_Ss_gene"="Ens_gene" ), gdbAss=c( # Gene associated IDs (DB) "miRBase"="miRBase", "UniGene"="UniGene" ), tdbCref=c( # Transcript cross-references DBs "RefSeq_mRNA"="RefSeq", "RefSeq_ncRNA"="RefSeq", "RefSeq_mRNA_predicted"="RefSeq", "RefSeq_ncRNA_predicted"="RefSeq", "Vega_transcript"="Vega_transcript", "Ens_Ss_transcript"="Ens_transcript" ), pdbCref=c( # Peptide cross-references DBs "RefSeq_peptide"="RefSeq_peptide", "RefSeq_peptide_predicted"="RefSeq_peptide", "Uniprot/SPTREMBL"="Uniprot", "Uniprot/SWISSPROT"="Uniprot", "Vega_translation"="Vega_translation", "Ens_Ss_translation"="Ens_translation" ), canChromosomes=c(1:18, "X", "Y", "MT") ) org <- "Danio rerio" ensembl_Drerio <- list( release=ensembl_release, organism=org, gv=ensembl_org_table$genome_version[which(ensembl_org_table$organism==org)], gdbCref=c( # Gene cross-references DBs "EntrezGene"="EntrezGene", "ZFIN_ID"="ZFIN_gene", "Vega_gene"="Vega_gene", "Ens_Dr_gene"="Ens_gene" ), gdbAss=c( # Gene associated IDs (DB) "miRBase"="miRBase", "UniGene"="UniGene" ), tdbCref=c( # Transcript cross-references DBs "RefSeq_mRNA"="RefSeq", "RefSeq_ncRNA"="RefSeq", "RefSeq_mRNA_predicted"="RefSeq", "RefSeq_ncRNA_predicted"="RefSeq", "Vega_transcript"="Vega_transcript", "Ens_Dr_transcript"="Ens_transcript" ), pdbCref=c( # Peptide cross-references DBs "RefSeq_peptide"="RefSeq_peptide", "RefSeq_peptide_predicted"="RefSeq_peptide", "Uniprot/SPTREMBL"="Uniprot", "Uniprot/SWISSPROT"="Uniprot", "Vega_translation"="Vega_translation", "Ens_Dr_translation"="Ens_translation" ), canChromosomes=c(1:25, "MT") ) ## General config reDumpThr <- as.difftime(config$SOURCE_REDUMP_THR, units="days") curDate <- Sys.Date()
BED is based on Neo4j.
The S01-NewBED-Container.sh shows how to run it in a docker container.
Because the import functions use massively the LOAD CSV
Neo4j query,
the feeding of the BED database can only be down from the
computer hosting the Neo4j relevant instance.
The chunk below shows how to connect to BED. In this example, neo4j authentication is disabled.
connectToBed( url=sprintf("localhost:%s", config$NJ_HTTP_PORT), remember=FALSE, useCache=TRUE, importPath=config$BED_IMPORT ) clearBedCache(force = TRUE, hard = TRUE)
Do not go further if your BED DB is not empty.
dbSize <- bedCall(cypher, 'MATCH (n) RETURN count(n)')[,1] if(dbSize!=0){ stop("BED DB is not empty ==> clean it before loading the content below") }
print(bedInstance) print(bedVersion) BED:::setBedVersion(bedInstance=bedInstance, bedVersion=bedVersion)
Start: r Sys.time()
BED:::loadBedModel()
End: r Sys.time()
Information is downloaded if older than r reDumpThr
days
according to the reDumpThr
object.
Start: r Sys.time()
BED:::loadNcbiTax( reDumpThr=reDumpThr, ddir=wd, orgOfInt=c( "Homo sapiens", "Rattus norvegicus", "Mus musculus", "Sus scrofa", "Danio rerio" ), curDate=curDate )
End: r Sys.time()
BED:::registerBEDB( name="Ens_gene", description="Ensembl gene", currentVersion=ensembl_release, idURL='http://www.ensembl.org/id/%s' )
BED:::registerBEDB( name="Ens_transcript", description="Ensembl transcript", currentVersion=ensembl_release, idURL='http://www.ensembl.org/id/%s' )
BED:::registerBEDB( name="Ens_translation", description="Ensembl peptides", currentVersion=ensembl_release, idURL='http://www.ensembl.org/id/%s' )
ensembl <- ensembl_Drerio print(ensembl)
Start: r Sys.time()
BED:::getEnsemblGeneIds( organism=ensembl$organism, release=ensembl$release, gv=ensembl$gv, ddir=wd, dbCref=ensembl$gdbCref, dbAss=ensembl$gdbAss, canChromosomes=ensembl$canChromosomes ) gc()
End: r Sys.time()
Start: r Sys.time()
BED:::getEnsemblTranscriptIds( organism=ensembl$organism, release=ensembl$release, gv=ensembl$gv, ddir=wd, dbCref=ensembl$tdbCref, canChromosomes=ensembl$canChromosomes ) gc()
End: r Sys.time()
Start: r Sys.time()
BED:::getEnsemblPeptideIds( organism=ensembl$organism, release=ensembl$release, gv=ensembl$gv, ddir=wd, dbCref=ensembl$pdbCref, canChromosomes=ensembl$canChromosomes ) gc()
End: r Sys.time()
ensembl <- ensembl_Hsapiens print(ensembl)
Start: r Sys.time()
BED:::getEnsemblGeneIds( organism=ensembl$organism, release=ensembl$release, gv=ensembl$gv, ddir=wd, dbCref=ensembl$gdbCref, dbAss=ensembl$gdbAss, canChromosomes=ensembl$canChromosomes ) gc()
End: r Sys.time()
Start: r Sys.time()
BED:::getEnsemblTranscriptIds( organism=ensembl$organism, release=ensembl$release, gv=ensembl$gv, ddir=wd, dbCref=ensembl$tdbCref, canChromosomes=ensembl$canChromosomes ) gc()
End: r Sys.time()
Start: r Sys.time()
BED:::getEnsemblPeptideIds( organism=ensembl$organism, release=ensembl$release, gv=ensembl$gv, ddir=wd, dbCref=ensembl$pdbCref, canChromosomes=ensembl$canChromosomes ) gc()
End: r Sys.time()
ensembl <- ensembl_Mmusculus print(ensembl)
Start: r Sys.time()
BED:::getEnsemblGeneIds( organism=ensembl$organism, release=ensembl$release, gv=ensembl$gv, ddir=wd, dbCref=ensembl$gdbCref, dbAss=ensembl$gdbAss, canChromosomes=ensembl$canChromosomes ) gc()
End: r Sys.time()
Start: r Sys.time()
BED:::getEnsemblTranscriptIds( organism=ensembl$organism, release=ensembl$release, gv=ensembl$gv, ddir=wd, dbCref=ensembl$tdbCref, canChromosomes=ensembl$canChromosomes ) gc()
End: r Sys.time()
Start: r Sys.time()
BED:::getEnsemblPeptideIds( organism=ensembl$organism, release=ensembl$release, gv=ensembl$gv, ddir=wd, dbCref=ensembl$pdbCref, canChromosomes=ensembl$canChromosomes ) gc()
End: r Sys.time()
ensembl <- ensembl_Rnorvegicus print(ensembl)
Start: r Sys.time()
BED:::getEnsemblGeneIds( organism=ensembl$organism, release=ensembl$release, gv=ensembl$gv, ddir=wd, dbCref=ensembl$gdbCref, dbAss=ensembl$gdbAss, canChromosomes=ensembl$canChromosomes ) gc()
End: r Sys.time()
Start: r Sys.time()
BED:::getEnsemblTranscriptIds( organism=ensembl$organism, release=ensembl$release, gv=ensembl$gv, ddir=wd, dbCref=ensembl$tdbCref, canChromosomes=ensembl$canChromosomes ) gc()
End: r Sys.time()
Start: r Sys.time()
BED:::getEnsemblPeptideIds( organism=ensembl$organism, release=ensembl$release, gv=ensembl$gv, ddir=wd, dbCref=ensembl$pdbCref, canChromosomes=ensembl$canChromosomes ) gc()
End: r Sys.time()
ensembl <- ensembl_Sscrofa print(ensembl)
Start: r Sys.time()
BED:::getEnsemblGeneIds( organism=ensembl$organism, release=ensembl$release, gv=ensembl$gv, ddir=wd, dbCref=ensembl$gdbCref, dbAss=ensembl$gdbAss, canChromosomes=ensembl$canChromosomes ) gc()
End: r Sys.time()
Start: r Sys.time()
BED:::getEnsemblTranscriptIds( organism=ensembl$organism, release=ensembl$release, gv=ensembl$gv, ddir=wd, dbCref=ensembl$tdbCref, canChromosomes=ensembl$canChromosomes ) gc()
End: r Sys.time()
Start: r Sys.time()
BED:::getEnsemblPeptideIds( organism=ensembl$organism, release=ensembl$release, gv=ensembl$gv, ddir=wd, dbCref=ensembl$pdbCref, canChromosomes=ensembl$canChromosomes ) gc()
End: r Sys.time()
Information is downloaded if older than r reDumpThr
days
according to the reDumpThr
object.
BED:::dumpNcbiDb( taxOfInt = c(), reDumpThr=reDumpThr, ddir=wd, toLoad=c(), curDate=curDate )
BED:::registerBEDB( name="EntrezGene", description="NCBI gene", currentVersion=format(dumpDate, "%Y%m%d"), idURL='https://www.ncbi.nlm.nih.gov/gene/%s' )
BED:::registerBEDB( name="RefSeq", description="NCBI nucleotide", currentVersion=format(dumpDate, "%Y%m%d"), idURL='https://www.ncbi.nlm.nih.gov/nuccore/%s' )
BED:::registerBEDB( name="RefSeq_peptide", description="NCBI protein", currentVersion=format(dumpDate, "%Y%m%d"), idURL='https://www.ncbi.nlm.nih.gov/protein/%s' )
Start: r Sys.time()
BED:::getNcbiGeneTransPep( organism="Danio rerio", ddir=wd, curDate=curDate ) gc()
End: r Sys.time()
Start: r Sys.time()
BED:::getNcbiGeneTransPep( organism="Homo sapiens", ddir=wd, curDate=curDate ) gc()
End: r Sys.time()
Start: r Sys.time()
BED:::getNcbiGeneTransPep( organism="Mus musculus", ddir=wd, curDate=curDate ) gc()
End: r Sys.time()
Start: r Sys.time()
BED:::getNcbiGeneTransPep( organism="Rattus norvegicus", ddir=wd, curDate=curDate ) gc()
End: r Sys.time()
Start: r Sys.time()
BED:::getNcbiGeneTransPep( organism="Sus scrofa", ddir=wd, curDate=curDate ) gc()
End: r Sys.time()
Start: r Sys.time()
message("Direct cross-references with Uniprot") BED:::dumpNcbiDb( taxOfInt="", reDumpThr=Inf, ddir=wd, toLoad="gene_refseq_uniprotkb_collab", curDate=Sys.Date() ) gene_refseq_uniprotkb_collab <- gene_refseq_uniprotkb_collab[ gene_refseq_uniprotkb_collab$method %in% c("uniprot", "identical"), ] gene_refseq_uniprotkb_collab$NCBI_protein_accession <- sub( "[.].*$", "", gene_refseq_uniprotkb_collab$NCBI_protein_accession ) for(org in listOrganisms()){ message(" ", org) curRS <- getBeIds( be="Peptide", source="RefSeq_peptide", organism=org, restricted=TRUE ) toAdd <- gene_refseq_uniprotkb_collab[ which( gene_refseq_uniprotkb_collab$NCBI_protein_accession %in% curRS$id ), ] ## External DB IDs toImport <- unique(toAdd[, "UniProtKB_protein_accession", drop=F]) colnames(toImport) <- "id" BED:::loadBE( d=toImport, be="Peptide", dbname="Uniprot", taxId=NA ) ## The cross references toImport <- toAdd[ , c("NCBI_protein_accession", "UniProtKB_protein_accession") ] colnames(toImport) <- c("id1", "id2") BED:::loadCorrespondsTo( d=toImport, db1="RefSeq_peptide", db2="Uniprot", be="Peptide" ) }
End: r Sys.time()
dbname <- "HGNC" hgnc_date <- jsonlite::read_json( "https://www.genenames.org/cgi-bin/statistics/db-last-updated" )$date BED:::registerBEDB( name=dbname, description="HUGO Gene Nomenclature Committee", currentVersion=hgnc_date, idURL='https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/HGNC:%s' )
hgnc <- read_tsv( file="https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/hgnc_complete_set.txt", col_types = "ccccccccccccccDDDDcccccccccccccccccccccccccccccccccccc" ) hgnc_genes <- hgnc |> select( hgnc_id, symbol, name, locus_group, locus_type, status, location #, location_sortable ) |> mutate(id = str_remove(hgnc_id, "^HGNC:")) |> relocate(id, .after = "hgnc_id") |> distinct() get_hgnc_xref <- function(colname){ toRet <- hgnc |> select(all_of(c("hgnc_id", colname))) values <- toRet |> pull(colname) |> strsplit("[|]") toRet <- tibble( hgnc_id=rep(toRet$hgnc_id, lengths(values)), value = unlist(values) ) return(toRet) } hgnc_alias_symbols <- get_hgnc_xref("alias_symbol") |> filter(!is.na(value)) |> rename("symbol"="value") |> mutate(id = str_remove(hgnc_id, "^HGNC:")) hgnc_alias_names <- get_hgnc_xref("alias_name") |> filter(!is.na(value)) |> rename("name"="value") |> mutate(id = str_remove(hgnc_id, "^HGNC:")) hgnc_previous_symbols <- get_hgnc_xref("prev_symbol") |> filter(!is.na(value)) |> rename("symbol"="value") |> mutate(id = str_remove(hgnc_id, "^HGNC:")) hgnc_previous_names <- get_hgnc_xref("prev_name") |> filter(!is.na(value)) |> rename("name"="value") |> mutate(id = str_remove(hgnc_id, "^HGNC:")) hgnc_entrez <- get_hgnc_xref("entrez_id") |> filter(!is.na(value)) |> rename("entrez"="value") |> mutate(id = str_remove(hgnc_id, "^HGNC:")) hgnc_ensembl <- get_hgnc_xref("ensembl_gene_id") |> filter(!is.na(value)) |> rename("ensembl"="value") |> mutate(id = str_remove(hgnc_id, "^HGNC:")) hgnc_omim <- get_hgnc_xref("omim_id") |> filter(!is.na(value)) |> rename("omim"="value") |> mutate(id = str_remove(hgnc_id, "^HGNC:")) hgnc_orphanet <- get_hgnc_xref("orphanet") |> filter(!is.na(value)) |> rename("orphanet"="value") |> mutate(id = str_remove(hgnc_id, "^HGNC:")) # hgnc_mgd <- get_hgnc_xref("mgd_id") |> # filter(!is.na(value)) |> # rename("mgd_id"="value") |> # mutate("mgd" = str_remove(mgd_id, "MGI:")) # hgnc_rgd <- get_hgnc_xref("rgd_id") |> # filter(!is.na(value)) |> # rename("rgd_id"="value") |> # mutate("rgd" = str_remove(rgd_id, "RGD:"))
toLoad <- hgnc_genes |> select(id) BED:::loadBE( toLoad, be = "Gene", dbname = dbname, version = hgnc_date, taxId = "9606" )
toLoad <- dplyr::bind_rows( hgnc_genes |> dplyr::select(id, symbol) |> dplyr::mutate(canonical = TRUE), hgnc_alias_symbols |> dplyr::select(id, symbol) |> dplyr::mutate(canonical = FALSE), hgnc_previous_symbols |> dplyr::select(id, symbol) |> dplyr::mutate(canonical = FALSE) )|> dplyr::distinct(id, symbol, .keep_all = TRUE) BED:::loadBESymbols(toLoad, be = "Gene", dbname = dbname)
toLoad <- dplyr::bind_rows( hgnc_genes |> dplyr::select(id, name) |> dplyr::mutate(canonical = TRUE), hgnc_alias_names |> dplyr::select(id, name) |> dplyr::mutate(canonical = FALSE), hgnc_previous_names |> dplyr::select(id, name) |> dplyr::mutate(canonical = FALSE) )|> dplyr::distinct(id, name, .keep_all = TRUE) BED:::loadBENames(toLoad, be = "Gene", dbname = dbname)
crdb <- "EntrezGene" toLoad <- hgnc_entrez |> dplyr::select(id = entrez) |> dplyr::distinct() BED:::loadBE( d=toLoad, be="Gene", dbname=crdb, taxId=NA ) ## The cross references toLoad <- hgnc_entrez |> dplyr::select(id1 = entrez, id2 = id) |> dplyr::distinct() BED:::loadCorrespondsTo( d=as.data.frame(toLoad), db1=crdb, db2=dbname, be="Gene" )
crdb <- "Ens_gene" toLoad <- hgnc_ensembl |> dplyr::select(id = ensembl) |> dplyr::distinct() BED:::loadBE( d=toLoad, be="Gene", dbname=crdb, taxId=NA ) ## The cross references toLoad <- hgnc_ensembl |> dplyr::select(id1 = ensembl, id2 = id) |> dplyr::distinct() BED:::loadCorrespondsTo( d=as.data.frame(toLoad), db1=crdb, db2=dbname, be="Gene" )
crdb <- "MIM_GENE" toLoad <- hgnc_omim |> dplyr::select(id = omim) |> dplyr::distinct() BED:::loadBE( d=toLoad, be="Gene", dbname=crdb, taxId=NA ) ## The cross references (associations) toLoad <- hgnc_omim |> dplyr::select(id1 = omim, id2 = id) |> dplyr::distinct() BED:::loadIsAssociatedTo( d=as.data.frame(toLoad), db1=crdb, db2=dbname, be="Gene" )
Release is defined according to the reldate.txt file on the Uniprot FTP and data is downloaded only if not already done for the current release.
ftp <- "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions" avRel <- readLines(file.path(ftp, "reldate.txt"), n=1) avRel <- sub( "^UniProt Knowledgebase Release ", "", sub(" consists of:$", "", avRel) ) if(is.na(as.Date(paste0(avRel, "_01"), format="%Y_%m_%d"))){ print(avRel) stop(sprintf("Check reldate.txt file on %s", ftp)) } BED:::registerBEDB( name="Uniprot", description="Uniprot", currentVersion=avRel, idURL='http://www.uniprot.org/uniprot/%s' )
Start: r Sys.time()
BED:::getUniprot( organism="Danio rerio", taxDiv="vertebrates", release=avRel, ddir=wd ) gc()
End: r Sys.time()
Start: r Sys.time()
BED:::getUniprot( organism="Homo sapiens", taxDiv="human", release=avRel, ddir=wd ) gc()
End: r Sys.time()
Start: r Sys.time()
BED:::getUniprot( organism="Mus musculus", taxDiv="rodents", release=avRel, ddir=wd ) gc()
End: r Sys.time()
Start: r Sys.time()
BED:::getUniprot( organism="Rattus norvegicus", taxDiv="rodents", release=avRel, ddir=wd ) gc()
End: r Sys.time()
Start: r Sys.time()
BED:::getUniprot( organism="Sus scrofa", taxDiv="mammals", release=avRel, ddir=wd ) gc()
End: r Sys.time()
Start: r Sys.time()
message("Indirect cross-references with Uniprot") dumpDir <- file.path(wd, "NCBI-gene-DATA") f <- "gene2accession.gz" if(file.exists(dumpDir)){ load(file.path(dumpDir, "dumpDate.rda")) message("Last download: ", dumpDate) if(curDate - dumpDate > reDumpThr | !file.exists(file.path(dumpDir, f))){ toDownload <- TRUE }else{ toDownload <- FALSE } }else{ message("Not downloaded yet") toDownload <- TRUE } if(toDownload){ ftp <- "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/" dlok <- try(download.file( url=paste0(ftp, f), destfile=file.path(dumpDir, f), method="wget", quiet=T ), silent=T) }else{ message("Existing data are going to be used") } cn <- readLines(file.path(dumpDir, f), n=1) cn <- sub("^#", "", cn) cn <- unlist(strsplit(cn, split="[ \t]")) for(org in listOrganisms()){ message(" ", org) tid <- getTaxId(org) toAdd <- read.table( text=system( sprintf("zgrep ^%s %s", tid, file.path(dumpDir, f)), intern=TRUE ), sep="\t", header=F, stringsAsFactors=F, quote="", comment.char="" ) colnames(toAdd) <- cn toAdd <- toAdd[ which(toAdd$tax_id==tid), c("tax_id", "GeneID", "protein_accession.version") ] toAdd$pacc <- sub("[.].*$", "", toAdd$protein_accession.version) curUP <- getBeIdConvTable( from="Gene", from.source="BEDTech_gene", organism=org, to="Peptide", to.source="Uniprot", restricted=TRUE ) toAdd <- merge( toAdd[,c("GeneID", "pacc")], curUP[,c("from", "to")], by.x="pacc", by.y="to", all=FALSE ) toAdd <- toAdd[,c("from", "GeneID")] toAdd$from <- as.character(toAdd$from) toAdd$GeneID <- as.character(toAdd$GeneID) colnames(toAdd) <- c("id1", "id2") BED:::loadIsAssociatedTo( d=toAdd, db1="BEDTech_gene", db2="EntrezGene", be="Gene" ) }
End: r Sys.time()
Start: r Sys.time()
## Dump data from ensembl orgOfInt <- c( "homo_sapiens", "mus_musculus", "rattus_norvegicus", "sus_scrofa", "danio_rerio" ) data_dir <- file.path(wd, "ensembl_homologies") dir.create(data_dir, showWarnings = FALSE) ensembl_homologies <- c() for(org in orgOfInt){ ## Proteins f_url <- sprintf( "https://ftp.ensembl.org/pub/release-113/tsv/ensembl-compara/homologies/%s/Compara.%s.protein_default.homologies.tsv", org, ensembl_release ) f_name <- sprintf( "%s.Compara.%s.protein_default.homologies.tsv", org, ensembl_release ) f_path <- file.path(data_dir, f_name) if(!file.exists(f_path)){ download.file(url = f_url, destfile = f_path) } to_add <- readr::read_tsv( f_path, col_types = "c" ) to_add <- to_add |> filter( is_high_confidence == "1", species %in% orgOfInt, homology_species %in% orgOfInt, species != homology_species ) |> select(gene_stable_id, homology_gene_stable_id, species, homology_species) ensembl_homologies <- bind_rows( ensembl_homologies, to_add ) ## ncRNA f_url <- sprintf( "https://ftp.ensembl.org/pub/release-113/tsv/ensembl-compara/homologies/%s/Compara.%s.ncrna_default.homologies.tsv", org, ensembl_release ) f_name <- sprintf( "%s.Compara.%s.ncrna_default.homologies.tsv", org, ensembl_release ) f_path <- file.path(data_dir, f_name) if(!file.exists(f_path)){ download.file(url = f_url, destfile = f_path) } to_add <- readr::read_tsv( f_path, col_types = "c" ) to_add <- to_add |> filter( is_high_confidence == "1", species %in% orgOfInt, homology_species %in% orgOfInt, species != homology_species ) |> select(gene_stable_id, homology_gene_stable_id, species, homology_species) ensembl_homologies <- bind_rows( ensembl_homologies, to_add ) } toImport <- ensembl_homologies |> dplyr::select(id1 = gene_stable_id, id2 = homology_gene_stable_id) |> as.data.frame() BED:::loadIsHomologOf( d=toImport, db1="Ens_gene", db2="Ens_gene", be="Gene" )
End: r Sys.time()
Start: r Sys.time()
##################################### gdbname <- "EntrezGene" taxOfInt <- unlist(lapply( c( "Homo sapiens", "Mus musculus", "Rattus norvegicus", "Sus scrofa", "Danio rerio" ), getTaxId )) for(i in 1:length(taxOfInt)){ BED:::dumpNcbiDb( taxOfInt=taxOfInt[i], reDumpThr=reDumpThr, ddir=wd, toLoad=c("gene_orthologs"), curDate=curDate ) toImport <- gene_orthologs[ which( gene_orthologs$tax_id %in% taxOfInt & gene_orthologs$Other_tax_id %in% taxOfInt & gene_orthologs$relationship == "Ortholog" ), c("GeneID", "Other_GeneID") ] if(nrow(toImport)>0){ colnames(toImport) <- c("id1", "id2") toImport <- dplyr::mutate_all(toImport, as.character) BED:::loadIsHomologOf( d=toImport, db1=gdbname, db2=gdbname, be="Gene" ) } } gc()
End: r Sys.time()
library(GEOquery) geoPath <- file.path(wd, "geo") dir.create(geoPath, showWarnings=FALSE)
Start: r Sys.time()
## Import plateform platName <- "GPL1708" gds <- getGEO(platName, destdir=geoPath) platDesc <- Meta(gds)$title be <- "Gene" ## BED:::loadPlf(name=platName, description=platDesc, be=be) ## Import mapping with Entrez d <- Table(gds) toImport <- d[which(!is.na(d$SPOT_ID)), c("SPOT_ID", "GENE")] colnames(toImport) <- c("probeID", "id") toImport$probeID <- as.character(toImport$probeID) toImport$id <- as.character(toImport$id) toImport <- toImport[which(!is.na(toImport$id)),] toImport <- unique(toImport) dbname <- "EntrezGene" ## BED:::loadProbes( d=toImport, be=be, platform=platName, dbname=dbname ) # ## Import mapping with UniGene # toImport <- d[which(!is.na(d$SPOT_ID)), c("SPOT_ID", "UNIGENE_ID")] # colnames(toImport) <- c("probeID", "id") # toImport$probeID <- as.character(toImport$probeID) # toImport$id <- as.character(toImport$id) # toImport <- toImport[which(!is.na(toImport$id) & toImport$id!=""),] # dbname <- "UniGene" # ## # BED:::loadProbes( # d=toImport, # be=be, # platform=platName, # dbname=dbname # )
End: r Sys.time()
Start: r Sys.time()
## Import plateform platName <- "GPL6480" gds <- getGEO(platName, destdir=geoPath) platDesc <- Meta(gds)$title be <- "Gene" ## BED:::loadPlf(name=platName, description=platDesc, be=be) ## Import mapping with Entrez d <- Table(gds) toImport <- d[which(!is.na(d$ID)), c("ID", "GENE")] colnames(toImport) <- c("probeID", "id") toImport$probeID <- as.character(toImport$probeID) toImport$id <- as.character(toImport$id) toImport <- toImport[which(!is.na(toImport$id)),] dbname <- "EntrezGene" ## BED:::loadProbes( d=toImport, be=be, platform=platName, dbname=dbname ) # ## Import mapping with UniGene # toImport <- d[which(!is.na(d$ID)), c("ID", "UNIGENE_ID")] # colnames(toImport) <- c("probeID", "id") # toImport$probeID <- as.character(toImport$probeID) # toImport$id <- as.character(toImport$id) # toImport <- toImport[which(!is.na(toImport$id)),] # dbname <- "UniGene" # ## # BED:::loadProbes( # d=toImport, # be=be, # platform=platName, # dbname=dbname # )
End: r Sys.time()
Start: r Sys.time()
## Import plateform platName <- "GPL570" gds <- getGEO(platName, destdir=geoPath) platDesc <- Meta(gds)$title be <- "Gene" ## BED:::loadPlf(name=platName, description=platDesc, be=be) ## Import mapping d <- Table(gds) toImport <- strsplit( as.character(d$ENTREZ_GENE_ID), split=" /// " ) names(toImport) <- d$ID toImport <- stack(toImport) toImport[,1] <- as.character(toImport[,1]) toImport[,2] <- as.character(toImport[,2]) colnames(toImport) <- c("id", "probeID") dbname <- "EntrezGene" ## BED:::loadProbes( d=toImport, be=be, platform=platName, dbname=dbname )
End: r Sys.time()
Start: r Sys.time()
## Import plateform platName <- "GPL571" gds <- getGEO(platName, destdir=geoPath) platDesc <- Meta(gds)$title be <- "Gene" ## BED:::loadPlf(name=platName, description=platDesc, be=be) ## Import mapping d <- Table(gds) toImport <- strsplit( as.character(d$ENTREZ_GENE_ID), split=" /// " ) names(toImport) <- d$ID toImport <- stack(toImport) toImport[,1] <- as.character(toImport[,1]) toImport[,2] <- as.character(toImport[,2]) colnames(toImport) <- c("id", "probeID") dbname <- "EntrezGene" ## BED:::loadProbes( d=toImport, be=be, platform=platName, dbname=dbname )
End: r Sys.time()
Start: r Sys.time()
## Import plateform platName <- "GPL13158" gds <- getGEO(platName, destdir=geoPath) platDesc <- Meta(gds)$title be <- "Gene" ## BED:::loadPlf(name=platName, description=platDesc, be=be) ## Import mapping d <- Table(gds) toImport <- strsplit( as.character(d$ENTREZ_GENE_ID), split=" /// " ) names(toImport) <- d$ID toImport <- stack(toImport) toImport[,1] <- as.character(toImport[,1]) toImport[,2] <- as.character(toImport[,2]) colnames(toImport) <- c("id", "probeID") dbname <- "EntrezGene" ## BED:::loadProbes( d=toImport, be=be, platform=platName, dbname=dbname )
End: r Sys.time()
Start: r Sys.time()
## Import plateform platName <- "GPL96" gds <- getGEO(platName, destdir=geoPath) platDesc <- Meta(gds)$title be <- "Gene" ## BED:::loadPlf(name=platName, description=platDesc, be=be) ## Import mapping d <- Table(gds) toImport <- strsplit( as.character(d$ENTREZ_GENE_ID), split=" /// " ) names(toImport) <- d$ID toImport <- stack(toImport) toImport[,1] <- as.character(toImport[,1]) toImport[,2] <- as.character(toImport[,2]) colnames(toImport) <- c("id", "probeID") dbname <- "EntrezGene" ## BED:::loadProbes( d=toImport, be=be, platform=platName, dbname=dbname )
End: r Sys.time()
Start: r Sys.time()
## Import plateform platName <- "GPL1261" gds <- getGEO(platName, destdir=geoPath) platDesc <- Meta(gds)$title be <- "Gene" ## BED:::loadPlf(name=platName, description=platDesc, be=be) ## Import mapping d <- Table(gds) toImport <- strsplit( as.character(d$ENTREZ_GENE_ID), split=" /// " ) names(toImport) <- d$ID toImport <- stack(toImport) toImport[,1] <- as.character(toImport[,1]) toImport[,2] <- as.character(toImport[,2]) colnames(toImport) <- c("id", "probeID") dbname <- "EntrezGene" ## BED:::loadProbes( d=toImport, be=be, platform=platName, dbname=dbname )
End: r Sys.time()
Start: r Sys.time()
## Import plateform platName <- "GPL1355" gds <- getGEO(platName, destdir=geoPath) platDesc <- Meta(gds)$title be <- "Gene" ## BED:::loadPlf(name=platName, description=platDesc, be=be) ## Import mapping d <- Table(gds) toImport <- strsplit( as.character(d$ENTREZ_GENE_ID), split=" /// " ) names(toImport) <- d$ID toImport <- stack(toImport) toImport[,1] <- as.character(toImport[,1]) toImport[,2] <- as.character(toImport[,2]) colnames(toImport) <- c("id", "probeID") dbname <- "EntrezGene" ## BED:::loadProbes( d=toImport, be=be, platform=platName, dbname=dbname )
End: r Sys.time()
Start: r Sys.time()
## Import plateform platName <- "GPL10558" gds <- getGEO(platName, destdir=geoPath) platDesc <- Meta(gds)$title be <- "Gene" ## BED:::loadPlf(name=platName, description=platDesc, be=be) ## Import mapping d <- Table(gds) toImport <- d[,c("Entrez_Gene_ID", "ID")] toImport[,1] <- as.character(toImport[,1]) toImport[,2] <- as.character(toImport[,2]) colnames(toImport) <- c("id", "probeID") toImport <- toImport[which(!is.na(toImport$id) & toImport$id != ""),] dbname <- "EntrezGene" ## BED:::loadProbes( d=toImport, be=be, platform=platName, dbname=dbname )
End: r Sys.time()
Start: r Sys.time()
## Import plateform platName <- "GPL6947" gds <- getGEO(platName, destdir=geoPath) platDesc <- Meta(gds)$title be <- "Gene" ## BED:::loadPlf(name=platName, description=platDesc, be=be) ## Import mapping d <- Table(gds) toImport <- d[,c("Entrez_Gene_ID", "ID")] toImport[,1] <- as.character(toImport[,1]) toImport[,2] <- as.character(toImport[,2]) colnames(toImport) <- c("id", "probeID") toImport <- toImport[which(!is.na(toImport$id) & toImport$id != ""),] dbname <- "EntrezGene" ## BED:::loadProbes( d=toImport, be=be, platform=platName, dbname=dbname )
End: r Sys.time()
Start: r Sys.time()
## Import plateform platName <- "GPL6885" gds <- getGEO(platName, destdir=geoPath) platDesc <- Meta(gds)$title d <- Table(gds) # e <- getBeIds( # be="Gene", source="EntrezGene", organism="mouse", restricted=FALSE # ) # sum(d$Entrez_Gene_ID %in% e$id) < sum(sub("[.].*$", "", d$RefSeq_ID) %in% f$id) be <- "Transcript" ## BED:::loadPlf(name=platName, description=platDesc, be=be) ## Import mapping toImport <- d[,c("RefSeq_ID", "ID")] toImport[,1] <- as.character(toImport[,1]) toImport[,1] <- sub("[.].*$", "", toImport[,1]) toImport[,2] <- as.character(toImport[,2]) colnames(toImport) <- c("id", "probeID") toImport <- toImport[which(!is.na(toImport$id) & toImport$id != ""),] dbname <- "RefSeq" ## BED:::loadProbes( d=toImport, be=be, platform=platName, dbname=dbname )
End: r Sys.time()
Start: r Sys.time()
## Import plateform platName <- "GPL6887" gds <- getGEO(platName, destdir=geoPath) platDesc <- Meta(gds)$title d <- Table(gds) # e <- getBeIds( # be="Gene", source="EntrezGene", organism="mouse", restricted=FALSE # ) # sum(d$Entrez_Gene_ID %in% e$id) > sum(sub("[.].*$", "", d$RefSeq_ID) %in% f$id) be <- "Gene" ## BED:::loadPlf(name=platName, description=platDesc, be=be) ## Import mapping toImport <- d[,c("Entrez_Gene_ID", "ID")] toImport[,1] <- as.character(toImport[,1]) # toImport[,1] <- sub("[.].*$", "", toImport[,1]) toImport[,2] <- as.character(toImport[,2]) colnames(toImport) <- c("id", "probeID") toImport <- toImport[which(!is.na(toImport$id) & toImport$id != ""),] dbname <- "EntrezGene" ## BED:::loadProbes( d=toImport, be=be, platform=platName, dbname=dbname )
End: r Sys.time()
Start: r Sys.time()
## Import plateform platName <- "GPL6101" gds <- getGEO(platName, destdir=geoPath) platDesc <- Meta(gds)$title be <- "Gene" ## BED:::loadPlf(name=platName, description=platDesc, be=be) ## Import mapping d <- Table(gds) toImport <- d[,c("Entrez_Gene_ID", "ID")] toImport[,1] <- as.character(toImport[,1]) toImport[,2] <- as.character(toImport[,2]) colnames(toImport) <- c("id", "probeID") toImport <- toImport[which(!is.na(toImport$id) & toImport$id != ""),] dbname <- "EntrezGene" ## BED:::loadProbes( d=toImport, be=be, platform=platName, dbname=dbname )
End: r Sys.time()
Start: r Sys.time()
to_exclude <- c( "affy_huex_1_0_st_v2", "affy_moex_1_0_st_v1", "affy_raex_1_0_st_v1" ) dump_ensembl_func <- function(ens_def){ base_url <- sprintf( "https://ftp.ensembl.org/pub/current_mysql/%s_funcgen_%s_%s", stringr::str_replace(tolower(ens_def$organism), " ", "_"), ens_def$release, ens_def$gv ) data_dir <- file.path( wd, sprintf( "%s_funcgen_%s_%s", stringr::str_replace(tolower(ens_def$organism), " ", "_"), ens_def$release, ens_def$gv ) ) dir.create(data_dir, showWarnings = FALSE) to_download <- c( "array.txt.gz", "probe.txt.gz", "probe_set.txt.gz", "probe_transcript.txt.gz", "probe_set_transcript.txt.gz" ) for(f in to_download){ fp <- file.path(data_dir, f) if(!file.exists(fp)){ download.file(file.path(base_url, f), fp) } } } load_ensembl_probes <- function(ens_def){ data_dir <- file.path( wd, sprintf( "%s_funcgen_%s_%s", stringr::str_replace(tolower(ens_def$organism), " ", "_"), ens_def$release, ens_def$gv ) ) array <- readr::read_tsv( file.path(data_dir, "array.txt.gz"), col_names = c( "array_id", "name", "format", "vendor", "description", "type", "class", "is_probeset_array", "is_linked_array", "has_sense_interrogation" ), col_types = "ccccccclll", na = "\\N" ) array <- array |> dplyr::filter( format %in% c("EXPRESSION"), !.data$name %in% to_exclude ) probe_arrays <- array |> filter(!is_probeset_array) |> pull(array_id) probe <- readr::read_tsv( file.path(data_dir, "probe.txt.gz"), col_names = c( "probe_id", "probe_set_id", "name", "length", "array_chip_id", "class", "description", "probe_seq_id" ), col_types = "cccnccccc", na = "\\N" ) probe <- probe |> filter(array_chip_id %in% probe_arrays) probe_set <- readr::read_tsv( file.path(data_dir, "probe_set.txt.gz"), col_names = c( "probe_set_id", "name", "size", "family", "array_chip_id" ), col_types = "cccncc", na = "\\N" ) probe_transcript <- readr::read_tsv( file.path(data_dir, "probe_transcript.txt.gz"), col_names = c( "probe_transcript_id", "probe_id", "stable_id", "description" ), col_types = "cccc", na = "\\N" ) probe_transcript <- probe_transcript |> filter(probe_id %in% probe$probe_id) probe_set_transcript <- readr::read_tsv( file.path(data_dir, "probe_set_transcript.txt.gz"), col_names = c( "probe_set_transcript_id", "probe_set_id", "stable_id", "description" ), col_types = "cccc", na = "\\N" ) be <- "Transcript" dbname <- "Ens_transcript" for(i in 1:nrow(array)){ message(" ", i, "/", nrow(array), " platforms") message(" ", Sys.time()) platName <- tolower(paste( str_replace(ens_def$organism, " ", "_"), array$vendor[i], array$name[i], sep = "_" )) is_probe_set <- array$is_probeset_array[i] platDesc <- paste( array$vendor[i], array$name[i], array$type[i], ifelse(is_probe_set, "probe set", "probe"), ens_def$organism, "(Ensembl mapping)" ) array_id <- array$array_id[i] if(is_probe_set){ toImport <- probe_set |> dplyr::filter(.data$array_chip_id==!!array_id) |> dplyr::select("probe_set_id", "name") |> dplyr::inner_join( probe_set_transcript |> dplyr::select("probe_set_id", "stable_id"), by = "probe_set_id" ) |> dplyr::select("probeID" = "name", "id" = "stable_id") |> dplyr::distinct() }else{ toImport <- probe |> dplyr::filter(.data$array_chip_id==!!array_id) |> dplyr::select("probe_id", "name") |> dplyr::inner_join( probe_transcript |> dplyr::select("probe_id", "stable_id"), by = "probe_id" ) |> dplyr::select("probeID" = "name", "id" = "stable_id") |> dplyr::distinct() } if(nrow(toImport) > 0){ ## Platform description BED:::loadPlf(name=platName, description=platDesc, be=be) ## Import mapping with Ens_transcript BED:::loadProbes( d=toImport, be=be, platform=platName, dbname=dbname ) } } } message("Ensembl probes for Drerio") dump_ensembl_func(ensembl_Drerio) load_ensembl_probes(ensembl_Drerio) message("Ensembl probes for Hsapiens") dump_ensembl_func(ensembl_Hsapiens) load_ensembl_probes(ensembl_Hsapiens) message("Ensembl probes for Mmusculus") dump_ensembl_func(ensembl_Mmusculus) load_ensembl_probes(ensembl_Mmusculus) message("Ensembl probes for Rnorvegicus") dump_ensembl_func(ensembl_Rnorvegicus) load_ensembl_probes(ensembl_Rnorvegicus) message("Ensembl probes for Sscrofa") dump_ensembl_func(ensembl_Sscrofa) load_ensembl_probes(ensembl_Sscrofa)
End: r Sys.time()
otherIdURL <- list( # "HGNC"='http://www.genenames.org/cgi-bin/gene_symbol_report?hgnc_id=%s', # "HGNC"='https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/HGNC:%s', "miRBase"='http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s', "Vega_gene"='http://vega.sanger.ac.uk/id/%s', "UniGene"='https://www.ncbi.nlm.nih.gov/unigene?term=%s', "Vega_transcript"='http://vega.sanger.ac.uk/id/%s', "MGI"='http://www.informatics.jax.org/marker/MGI:%s', "Vega_translation"='http://vega.sanger.ac.uk/id/%s', "RGD"='https://rgd.mcw.edu/rgdweb/report/gene/main.html?id=%s', "MIM_GENE"='http://www.omim.org/entry/%s', "ZFIN_gene"='http://zfin.org/%s' ) for(db in names(otherIdURL)){ BED:::registerBEDB( name=db, idURL=otherIdURL[[db]] ) }
Start: r Sys.time()
BED:::loadLuceneIndexes()
End: r Sys.time()
clearBedCache(force = TRUE, hard = TRUE) sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.