In patzaw/BED: Biological Entity Dictionary (BED)

Introduction

This document shows how to feed the Biological Entity Dictionary (BED). It can be adapted according to specific needs and DB access. The BED functions used to feed the DB are not exported to avoid unintended modifications of the DB. To call them, they are preceded by BED:::.

In this example several source databases are dumped and their content is integrated in BED. Some helper functions are provided to get information from famous databases. The following chunk is used to configure source versions. The reDumpThr object is used to define time intervals during which some data sources should not be re-downloaded.

##
library(knitr)
library(BED)
library(jsonlite)
library(rvest)
library(dplyr)
library(readr)
library(stringr)
##
config <- jsonlite::read_json("build_config.json")
config <- lapply(
  config, function(x){
     if(!is.character(x)){
        return(x)
     }else{
        sub(pattern="___HOME___", replacement=Sys.getenv("HOME"), x = x)
     }
  }
)
config <- lapply(
  config, function(x){
     if(!is.character(x)){
        return(x)
     }else{
        sub(pattern="___ROOT___", replacement=config$ROOT, x = x)
     }
  }
)
##
wd <- config$BED_WORKING

##
# opts_knit$set(root.dir=wd)
opts_chunk$set(
   eval=TRUE,
   message=FALSE
   # root.dir=wd
)
## Specific config
bedInstance <- config$BED_INSTANCE
bedVersion <- config$BED_VERSION

## Identify the latest ensembl release here: https://www.ensembl.org/
ensembl_org_dir <- grep(
  "_core_",
  rvest::html_table(
    rvest::read_html("https://ftp.ensembl.org/pub/current_mysql/")
  )[[1]]$Name,
  value=TRUE
)
ensembl_org_table <- do.call(rbind, lapply(
  strsplit(ensembl_org_dir, split="_"),
  function(x){
    gv <- sub("/", "", x[length(x)])
    e <- x[length(x)-1]
    core <- which(x=="core")
    org <- paste(x[1:(core-1)], collapse=" ")
    substr(org, 1, 1) <- toupper(substr(org, 1,1))
    return(data.frame(
       organism=org,
       release=e,
       genome_version=gv
    ))
  }
))
ensembl_org_table <- ensembl_org_table[
  order(as.numeric(ensembl_org_table$genome_version), decreasing=T),
]
ensembl_org_table <- ensembl_org_table[
  which(!duplicated(ensembl_org_table$organism)),
]

ensembl_release <- unique(ensembl_org_table$release)
stopifnot(length(ensembl_release)==1)

org <-"Homo sapiens"
ensembl_Hsapiens <- list(
    release=ensembl_release,
    organism=org,
    gv=ensembl_org_table$genome_version[which(ensembl_org_table$organism==org)],
    gdbCref=c(                      # Gene cross-references DBs
        "HGNC"="HGNC",
        "EntrezGene"="EntrezGene",
        "Vega_gene"="Vega_gene",
        "Ens_Hs_gene"="Ens_gene"
    ),
    gdbAss=c(                       # Gene associated IDs (DB)
        "miRBase"="miRBase",
        "MIM_GENE"="MIM_GENE",
        "UniGene"="UniGene"
    ),
    tdbCref=c(                      # Transcript cross-references DBs
        "RefSeq_mRNA"="RefSeq",
        "RefSeq_ncRNA"="RefSeq",
        "RefSeq_mRNA_predicted"="RefSeq",
        "RefSeq_ncRNA_predicted"="RefSeq",
        "Vega_transcript"="Vega_transcript",
        "Ens_Hs_transcript"="Ens_transcript"
    ),
    pdbCref=c(                      # Peptide cross-references DBs
        "RefSeq_peptide"="RefSeq_peptide",
        "RefSeq_peptide_predicted"="RefSeq_peptide",
        "Uniprot/SPTREMBL"="Uniprot",
        "Uniprot/SWISSPROT"="Uniprot",
        "Vega_translation"="Vega_translation",
        "Ens_Hs_translation"="Ens_translation"
    ),
    canChromosomes=c(1:22, "X", "Y", "MT")
)
org <- "Mus musculus"
ensembl_Mmusculus <- list(
    release=ensembl_release,
    organism=org,
    gv=ensembl_org_table$genome_version[which(ensembl_org_table$organism==org)],
    gdbCref=c(                      # Gene cross-references DBs
        "MGI"="MGI",
        "EntrezGene"="EntrezGene",
        "Vega_gene"="Vega_gene",
        "Ens_Mm_gene"="Ens_gene"
    ),
    gdbAss=c(                       # Gene associated IDs (DB)
        "miRBase"="miRBase",
        "UniGene"="UniGene"
    ),
    tdbCref=c(                      # Transcript cross-references DBs
        "RefSeq_mRNA"="RefSeq",
        "RefSeq_ncRNA"="RefSeq",
        "RefSeq_mRNA_predicted"="RefSeq",
        "RefSeq_ncRNA_predicted"="RefSeq",
        "Vega_transcript"="Vega_transcript",
        "Ens_Mm_transcript"="Ens_transcript"
    ),
    pdbCref=c(                      # Peptide cross-references DBs
        "RefSeq_peptide"="RefSeq_peptide",
        "RefSeq_peptide_predicted"="RefSeq_peptide",
        "Uniprot/SPTREMBL"="Uniprot",
        "Uniprot/SWISSPROT"="Uniprot",
        "Vega_translation"="Vega_translation",
        "Ens_Mm_translation"="Ens_translation"
    ),
    canChromosomes=c(1:19, "X", "Y", "MT")
)
org <- "Rattus norvegicus"
ensembl_Rnorvegicus <- list(
    release=ensembl_release,
    organism=org,
    gv=ensembl_org_table$genome_version[which(ensembl_org_table$organism==org)],
    gdbCref=c(                      # Gene cross-references DBs
        "RGD"="RGD",
        "EntrezGene"="EntrezGene",
        "Vega_gene"="Vega_gene",
        "Ens_Rn_gene"="Ens_gene"
    ),
    gdbAss=c(                       # Gene associated IDs (DB)
        "miRBase"="miRBase",
        "UniGene"="UniGene"
    ),
    tdbCref=c(                      # Transcript cross-references DBs
        "RefSeq_mRNA"="RefSeq",
        "RefSeq_ncRNA"="RefSeq",
        "RefSeq_mRNA_predicted"="RefSeq",
        "RefSeq_ncRNA_predicted"="RefSeq",
        "Vega_transcript"="Vega_transcript",
        "Ens_Rn_transcript"="Ens_transcript"
    ),
    pdbCref=c(                      # Peptide cross-references DBs
        "RefSeq_peptide"="RefSeq_peptide",
        "RefSeq_peptide_predicted"="RefSeq_peptide",
        "Uniprot/SPTREMBL"="Uniprot",
        "Uniprot/SWISSPROT"="Uniprot",
        "Vega_translation"="Vega_translation",
        "Ens_Rn_translation"="Ens_translation"
    ),
    canChromosomes=c(1:20, "X", "Y", "MT")
)
org <- "Sus scrofa"
ensembl_Sscrofa <- list(
    release=ensembl_release,
    organism=org,
    gv=ensembl_org_table$genome_version[which(ensembl_org_table$organism==org)],
    gdbCref=c(                      # Gene cross-references DBs
        "EntrezGene"="EntrezGene",
        "Vega_gene"="Vega_gene",
        "Ens_Ss_gene"="Ens_gene"
    ),
    gdbAss=c(                       # Gene associated IDs (DB)
        "miRBase"="miRBase",
        "UniGene"="UniGene"
    ),
    tdbCref=c(                      # Transcript cross-references DBs
        "RefSeq_mRNA"="RefSeq",
        "RefSeq_ncRNA"="RefSeq",
        "RefSeq_mRNA_predicted"="RefSeq",
        "RefSeq_ncRNA_predicted"="RefSeq",
        "Vega_transcript"="Vega_transcript",
        "Ens_Ss_transcript"="Ens_transcript"
    ),
    pdbCref=c(                      # Peptide cross-references DBs
        "RefSeq_peptide"="RefSeq_peptide",
        "RefSeq_peptide_predicted"="RefSeq_peptide",
        "Uniprot/SPTREMBL"="Uniprot",
        "Uniprot/SWISSPROT"="Uniprot",
        "Vega_translation"="Vega_translation",
        "Ens_Ss_translation"="Ens_translation"
    ),
    canChromosomes=c(1:18, "X", "Y", "MT")
)
org <- "Danio rerio"
ensembl_Drerio <- list(
    release=ensembl_release,
    organism=org,
    gv=ensembl_org_table$genome_version[which(ensembl_org_table$organism==org)],
    gdbCref=c(                      # Gene cross-references DBs
        "EntrezGene"="EntrezGene",
        "ZFIN_ID"="ZFIN_gene",
        "Vega_gene"="Vega_gene",
        "Ens_Dr_gene"="Ens_gene"
    ),
    gdbAss=c(                       # Gene associated IDs (DB)
        "miRBase"="miRBase",
        "UniGene"="UniGene"
    ),
    tdbCref=c(                      # Transcript cross-references DBs
        "RefSeq_mRNA"="RefSeq",
        "RefSeq_ncRNA"="RefSeq",
        "RefSeq_mRNA_predicted"="RefSeq",
        "RefSeq_ncRNA_predicted"="RefSeq",
        "Vega_transcript"="Vega_transcript",
        "Ens_Dr_transcript"="Ens_transcript"
    ),
    pdbCref=c(                      # Peptide cross-references DBs
        "RefSeq_peptide"="RefSeq_peptide",
        "RefSeq_peptide_predicted"="RefSeq_peptide",
        "Uniprot/SPTREMBL"="Uniprot",
        "Uniprot/SWISSPROT"="Uniprot",
        "Vega_translation"="Vega_translation",
        "Ens_Dr_translation"="Ens_translation"
    ),
    canChromosomes=c(1:25, "MT")
)
## General config
reDumpThr <- as.difftime(config$SOURCE_REDUMP_THR, units="days")
curDate <- Sys.Date()

BED initialization

BED is based on Neo4j.

The S01-NewBED-Container.sh shows how to run it in a docker container.

Because the import functions use massively the LOAD CSV Neo4j query, the feeding of the BED database can only be down from the computer hosting the Neo4j relevant instance.

The chunk below shows how to connect to BED. In this example, neo4j authentication is disabled.

connectToBed(
   url=sprintf("localhost:%s", config$NJ_HTTP_PORT),
   remember=FALSE,
   useCache=TRUE,
   importPath=config$BED_IMPORT
)
clearBedCache(force = TRUE, hard = TRUE)

Check empty DB

Do not go further if your BED DB is not empty.

dbSize <- bedCall(cypher, 'MATCH (n) RETURN count(n)')[,1]
if(dbSize!=0){
    stop("BED DB is not empty ==> clean it before loading the content below")
}

Set BED instance and version

print(bedInstance)
print(bedVersion)
BED:::setBedVersion(bedInstance=bedInstance, bedVersion=bedVersion)

Load Data model

Start: r Sys.time()

BED:::loadBedModel()

End: r Sys.time()

Loading taxonomy from NCBI

Information is downloaded if older than r reDumpThr days according to the reDumpThr object.

Start: r Sys.time()

BED:::loadNcbiTax(
    reDumpThr=reDumpThr,
    ddir=wd,
    orgOfInt=c(
       "Homo sapiens", "Rattus norvegicus", "Mus musculus",
       "Sus scrofa", "Danio rerio"
      ),
    curDate=curDate
)

End: r Sys.time()

Loading data from Ensembl

Register Ensembl DBs

Genes

BED:::registerBEDB(
    name="Ens_gene",
    description="Ensembl gene",
    currentVersion=ensembl_release,
    idURL='http://www.ensembl.org/id/%s'
)

Transcripts

BED:::registerBEDB(
    name="Ens_transcript",
    description="Ensembl transcript",
    currentVersion=ensembl_release,
    idURL='http://www.ensembl.org/id/%s'
)

Peptides

BED:::registerBEDB(
    name="Ens_translation",
    description="Ensembl peptides",
    currentVersion=ensembl_release,
    idURL='http://www.ensembl.org/id/%s'
)

Danio rerio

ensembl <- ensembl_Drerio
print(ensembl)

Genes

Start: r Sys.time()

BED:::getEnsemblGeneIds(
    organism=ensembl$organism,
    release=ensembl$release,
    gv=ensembl$gv,
    ddir=wd,
    dbCref=ensembl$gdbCref,
    dbAss=ensembl$gdbAss,
    canChromosomes=ensembl$canChromosomes
)
gc()

End: r Sys.time()

Transcripts

Start: r Sys.time()

BED:::getEnsemblTranscriptIds(
    organism=ensembl$organism,
    release=ensembl$release,
    gv=ensembl$gv,
    ddir=wd,
    dbCref=ensembl$tdbCref,
    canChromosomes=ensembl$canChromosomes
)
gc()

End: r Sys.time()

Peptides

Start: r Sys.time()

BED:::getEnsemblPeptideIds(
    organism=ensembl$organism,
    release=ensembl$release,
    gv=ensembl$gv,
    ddir=wd,
    dbCref=ensembl$pdbCref,
    canChromosomes=ensembl$canChromosomes
)
gc()

End: r Sys.time()

Homo sapiens

ensembl <- ensembl_Hsapiens
print(ensembl)

Genes

Start: r Sys.time()

BED:::getEnsemblGeneIds(
    organism=ensembl$organism,
    release=ensembl$release,
    gv=ensembl$gv,
    ddir=wd,
    dbCref=ensembl$gdbCref,
    dbAss=ensembl$gdbAss,
    canChromosomes=ensembl$canChromosomes
)
gc()

End: r Sys.time()

Transcripts

Start: r Sys.time()

BED:::getEnsemblTranscriptIds(
    organism=ensembl$organism,
    release=ensembl$release,
    gv=ensembl$gv,
    ddir=wd,
    dbCref=ensembl$tdbCref,
    canChromosomes=ensembl$canChromosomes
)
gc()

End: r Sys.time()

Peptides

Start: r Sys.time()

BED:::getEnsemblPeptideIds(
    organism=ensembl$organism,
    release=ensembl$release,
    gv=ensembl$gv,
    ddir=wd,
    dbCref=ensembl$pdbCref,
    canChromosomes=ensembl$canChromosomes
)
gc()

End: r Sys.time()

Mus musculus

ensembl <- ensembl_Mmusculus
print(ensembl)

Genes

Start: r Sys.time()

BED:::getEnsemblGeneIds(
    organism=ensembl$organism,
    release=ensembl$release,
    gv=ensembl$gv,
    ddir=wd,
    dbCref=ensembl$gdbCref,
    dbAss=ensembl$gdbAss,
    canChromosomes=ensembl$canChromosomes
)
gc()

End: r Sys.time()

Transcripts

Start: r Sys.time()

BED:::getEnsemblTranscriptIds(
    organism=ensembl$organism,
    release=ensembl$release,
    gv=ensembl$gv,
    ddir=wd,
    dbCref=ensembl$tdbCref,
    canChromosomes=ensembl$canChromosomes
)
gc()

End: r Sys.time()

Peptides

Start: r Sys.time()

BED:::getEnsemblPeptideIds(
    organism=ensembl$organism,
    release=ensembl$release,
    gv=ensembl$gv,
    ddir=wd,
    dbCref=ensembl$pdbCref,
    canChromosomes=ensembl$canChromosomes
)
gc()

End: r Sys.time()

Rattus norvegicus

ensembl <- ensembl_Rnorvegicus
print(ensembl)

Genes

Start: r Sys.time()

BED:::getEnsemblGeneIds(
    organism=ensembl$organism,
    release=ensembl$release,
    gv=ensembl$gv,
    ddir=wd,
    dbCref=ensembl$gdbCref,
    dbAss=ensembl$gdbAss,
    canChromosomes=ensembl$canChromosomes
)
gc()

End: r Sys.time()

Transcripts

Start: r Sys.time()

BED:::getEnsemblTranscriptIds(
    organism=ensembl$organism,
    release=ensembl$release,
    gv=ensembl$gv,
    ddir=wd,
    dbCref=ensembl$tdbCref,
    canChromosomes=ensembl$canChromosomes
)
gc()

End: r Sys.time()

Peptides

Start: r Sys.time()

BED:::getEnsemblPeptideIds(
    organism=ensembl$organism,
    release=ensembl$release,
    gv=ensembl$gv,
    ddir=wd,
    dbCref=ensembl$pdbCref,
    canChromosomes=ensembl$canChromosomes
)
gc()

End: r Sys.time()

Sus scrofa

ensembl <- ensembl_Sscrofa
print(ensembl)

Genes

Start: r Sys.time()

BED:::getEnsemblGeneIds(
    organism=ensembl$organism,
    release=ensembl$release,
    gv=ensembl$gv,
    ddir=wd,
    dbCref=ensembl$gdbCref,
    dbAss=ensembl$gdbAss,
    canChromosomes=ensembl$canChromosomes
)
gc()

End: r Sys.time()

Transcripts

Start: r Sys.time()

BED:::getEnsemblTranscriptIds(
    organism=ensembl$organism,
    release=ensembl$release,
    gv=ensembl$gv,
    ddir=wd,
    dbCref=ensembl$tdbCref,
    canChromosomes=ensembl$canChromosomes
)
gc()

End: r Sys.time()

Peptides

Start: r Sys.time()

BED:::getEnsemblPeptideIds(
    organism=ensembl$organism,
    release=ensembl$release,
    gv=ensembl$gv,
    ddir=wd,
    dbCref=ensembl$pdbCref,
    canChromosomes=ensembl$canChromosomes
)
gc()

End: r Sys.time()

Loading data from NCBI

Information is downloaded if older than r reDumpThr days according to the reDumpThr object.

Register NCBI DBs

BED:::dumpNcbiDb(
  taxOfInt = c(), reDumpThr=reDumpThr,
  ddir=wd,
  toLoad=c(), curDate=curDate
)

Genes

BED:::registerBEDB(
    name="EntrezGene",
    description="NCBI gene",
    currentVersion=format(dumpDate, "%Y%m%d"),
    idURL='https://www.ncbi.nlm.nih.gov/gene/%s'
)

Transcripts

BED:::registerBEDB(
    name="RefSeq",
    description="NCBI nucleotide",
    currentVersion=format(dumpDate, "%Y%m%d"),
    idURL='https://www.ncbi.nlm.nih.gov/nuccore/%s'
)

Peptides

BED:::registerBEDB(
    name="RefSeq_peptide",
    description="NCBI protein",
    currentVersion=format(dumpDate, "%Y%m%d"),
    idURL='https://www.ncbi.nlm.nih.gov/protein/%s'
)

Danio rerio data

Start: r Sys.time()

BED:::getNcbiGeneTransPep(
    organism="Danio rerio",
    ddir=wd,
    curDate=curDate
)
gc()

End: r Sys.time()

Homo sapiens data

Start: r Sys.time()

BED:::getNcbiGeneTransPep(
    organism="Homo sapiens",
    ddir=wd,
    curDate=curDate
)
gc()

End: r Sys.time()

Mus musculus data

Start: r Sys.time()

BED:::getNcbiGeneTransPep(
    organism="Mus musculus",
    ddir=wd,
    curDate=curDate
)
gc()

End: r Sys.time()

Rattus norvegicus data

Start: r Sys.time()

BED:::getNcbiGeneTransPep(
    organism="Rattus norvegicus",
    ddir=wd,
    curDate=curDate
)
gc()

End: r Sys.time()

Sus scrofa data

Start: r Sys.time()

BED:::getNcbiGeneTransPep(
    organism="Sus scrofa",
    ddir=wd,
    curDate=curDate
)
gc()

End: r Sys.time()

Direct cross-references with Uniprot

Start: r Sys.time()

message("Direct cross-references with Uniprot")
BED:::dumpNcbiDb(
  taxOfInt="",
  reDumpThr=Inf,
  ddir=wd,
  toLoad="gene_refseq_uniprotkb_collab",
  curDate=Sys.Date()
)
gene_refseq_uniprotkb_collab <- gene_refseq_uniprotkb_collab[
   gene_refseq_uniprotkb_collab$method %in% c("uniprot", "identical"),
]
gene_refseq_uniprotkb_collab$NCBI_protein_accession <- sub(
   "[.].*$", "", gene_refseq_uniprotkb_collab$NCBI_protein_accession
)
for(org in listOrganisms()){
  message("   ", org)
  curRS <- getBeIds(
    be="Peptide", source="RefSeq_peptide", organism=org,
    restricted=TRUE
  )
  toAdd <- gene_refseq_uniprotkb_collab[
      which(
         gene_refseq_uniprotkb_collab$NCBI_protein_accession %in% curRS$id
      ),
  ]
  ## External DB IDs
  toImport <- unique(toAdd[, "UniProtKB_protein_accession", drop=F])
  colnames(toImport) <- "id"
  BED:::loadBE(
      d=toImport, be="Peptide",
      dbname="Uniprot",
      taxId=NA
  )
  ## The cross references
  toImport <- toAdd[
    , c("NCBI_protein_accession", "UniProtKB_protein_accession")
  ]
  colnames(toImport) <- c("id1", "id2")
  BED:::loadCorrespondsTo(
      d=toImport,
      db1="RefSeq_peptide",
      db2="Uniprot",
      be="Peptide"
  )
}

End: r Sys.time()

Loading data from HGNC

DB information

dbname <- "HGNC"
hgnc_date <- jsonlite::read_json(
   "https://www.genenames.org/cgi-bin/statistics/db-last-updated"
)$date
BED:::registerBEDB(
    name=dbname,
    description="HUGO Gene Nomenclature Committee",
    currentVersion=hgnc_date,
    idURL='https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/HGNC:%s'
)

hgnc <- read_tsv(
   file="https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/hgnc_complete_set.txt",
   col_types = "ccccccccccccccDDDDcccccccccccccccccccccccccccccccccccc"
)
hgnc_genes <- hgnc |>
  select(
    hgnc_id, symbol, name, locus_group, locus_type, status,
    location #, location_sortable
  ) |>
  mutate(id = str_remove(hgnc_id, "^HGNC:")) |>
  relocate(id, .after = "hgnc_id") |>
  distinct()

get_hgnc_xref <- function(colname){
  toRet <- hgnc |>
    select(all_of(c("hgnc_id", colname)))
  values <- toRet |> pull(colname) |> strsplit("[|]")
  toRet <- tibble(
    hgnc_id=rep(toRet$hgnc_id, lengths(values)),
    value = unlist(values)
  )
  return(toRet)
}

hgnc_alias_symbols <- get_hgnc_xref("alias_symbol") |>
  filter(!is.na(value)) |>
  rename("symbol"="value") |> 
  mutate(id = str_remove(hgnc_id, "^HGNC:"))
hgnc_alias_names <- get_hgnc_xref("alias_name") |>
  filter(!is.na(value)) |>
  rename("name"="value") |> 
  mutate(id = str_remove(hgnc_id, "^HGNC:"))
hgnc_previous_symbols <- get_hgnc_xref("prev_symbol") |>
  filter(!is.na(value)) |>
  rename("symbol"="value") |> 
  mutate(id = str_remove(hgnc_id, "^HGNC:"))
hgnc_previous_names <- get_hgnc_xref("prev_name") |>
  filter(!is.na(value)) |>
  rename("name"="value") |> 
  mutate(id = str_remove(hgnc_id, "^HGNC:"))
hgnc_entrez <- get_hgnc_xref("entrez_id") |>
  filter(!is.na(value)) |>
  rename("entrez"="value") |> 
  mutate(id = str_remove(hgnc_id, "^HGNC:"))
hgnc_ensembl <- get_hgnc_xref("ensembl_gene_id") |>
  filter(!is.na(value)) |>
  rename("ensembl"="value") |> 
  mutate(id = str_remove(hgnc_id, "^HGNC:"))
hgnc_omim <- get_hgnc_xref("omim_id") |>
  filter(!is.na(value)) |>
  rename("omim"="value") |> 
  mutate(id = str_remove(hgnc_id, "^HGNC:"))
hgnc_orphanet <- get_hgnc_xref("orphanet") |>
  filter(!is.na(value)) |>
  rename("orphanet"="value") |> 
  mutate(id = str_remove(hgnc_id, "^HGNC:"))
# hgnc_mgd <- get_hgnc_xref("mgd_id") |>
#   filter(!is.na(value)) |>
#   rename("mgd_id"="value") |>
#   mutate("mgd" = str_remove(mgd_id, "MGI:"))
# hgnc_rgd <- get_hgnc_xref("rgd_id") |>
#   filter(!is.na(value)) |>
#   rename("rgd_id"="value") |>
#   mutate("rgd" = str_remove(rgd_id, "RGD:"))

Identifiers

toLoad <- hgnc_genes |> select(id)
BED:::loadBE(
   toLoad, be = "Gene",
   dbname = dbname, version = hgnc_date,
   taxId = "9606"
)

Symbols

toLoad <- dplyr::bind_rows(
   hgnc_genes |> 
      dplyr::select(id, symbol) |> 
      dplyr::mutate(canonical = TRUE),
   hgnc_alias_symbols |> 
      dplyr::select(id, symbol) |> 
      dplyr::mutate(canonical = FALSE),
   hgnc_previous_symbols |> 
      dplyr::select(id, symbol) |> 
      dplyr::mutate(canonical = FALSE)
)|>
   dplyr::distinct(id, symbol, .keep_all = TRUE)
BED:::loadBESymbols(toLoad, be = "Gene", dbname = dbname)

Names

toLoad <- dplyr::bind_rows(
   hgnc_genes |> 
      dplyr::select(id, name) |> 
      dplyr::mutate(canonical = TRUE),
   hgnc_alias_names |> 
      dplyr::select(id, name) |> 
      dplyr::mutate(canonical = FALSE),
   hgnc_previous_names |> 
      dplyr::select(id, name) |> 
      dplyr::mutate(canonical = FALSE)
)|>
   dplyr::distinct(id, name, .keep_all = TRUE)
BED:::loadBENames(toLoad, be = "Gene", dbname = dbname)

Cross references

EntrezGene

crdb <- "EntrezGene"
toLoad <- hgnc_entrez |> 
   dplyr::select(id = entrez) |> 
   dplyr::distinct()
BED:::loadBE(
   d=toLoad, be="Gene",
   dbname=crdb,
   taxId=NA
)
## The cross references
toLoad <- hgnc_entrez |> 
   dplyr::select(id1 = entrez, id2 = id) |> 
   dplyr::distinct()
BED:::loadCorrespondsTo(
   d=as.data.frame(toLoad),
   db1=crdb,
   db2=dbname,
   be="Gene"
)

Ens_gene

crdb <- "Ens_gene"
toLoad <- hgnc_ensembl |> 
   dplyr::select(id = ensembl) |> 
   dplyr::distinct()
BED:::loadBE(
   d=toLoad, be="Gene",
   dbname=crdb,
   taxId=NA
)
## The cross references
toLoad <- hgnc_ensembl |> 
   dplyr::select(id1 = ensembl, id2 = id) |> 
   dplyr::distinct()
BED:::loadCorrespondsTo(
   d=as.data.frame(toLoad),
   db1=crdb,
   db2=dbname,
   be="Gene"
)

MIM_GENE

crdb <- "MIM_GENE"
toLoad <- hgnc_omim |> 
   dplyr::select(id = omim) |> 
   dplyr::distinct()
BED:::loadBE(
   d=toLoad, be="Gene",
   dbname=crdb,
   taxId=NA
)
## The cross references (associations)
toLoad <- hgnc_omim |> 
   dplyr::select(id1 = omim, id2 = id) |> 
   dplyr::distinct()
BED:::loadIsAssociatedTo(
   d=as.data.frame(toLoad),
   db1=crdb,
   db2=dbname,
   be="Gene"
)

Loading data from Uniprot

Release is defined according to the reldate.txt file on the Uniprot FTP and data is downloaded only if not already done for the current release.

ftp <- "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions"
avRel <- readLines(file.path(ftp, "reldate.txt"), n=1)
avRel <- sub(
    "^UniProt Knowledgebase Release ", "",
    sub(" consists of:$", "", avRel)
)
if(is.na(as.Date(paste0(avRel, "_01"), format="%Y_%m_%d"))){
    print(avRel)
    stop(sprintf("Check reldate.txt file on %s", ftp))
}
BED:::registerBEDB(
    name="Uniprot",
    description="Uniprot",
    currentVersion=avRel,
    idURL='http://www.uniprot.org/uniprot/%s'
)

Danio rerio data

Start: r Sys.time()

BED:::getUniprot(
    organism="Danio rerio", taxDiv="vertebrates", release=avRel, ddir=wd
)
gc()

End: r Sys.time()

Homo sapiens data

Start: r Sys.time()

BED:::getUniprot(
    organism="Homo sapiens", taxDiv="human", release=avRel, ddir=wd
)
gc()

End: r Sys.time()

Mus musculus data

Start: r Sys.time()

BED:::getUniprot(
    organism="Mus musculus", taxDiv="rodents", release=avRel, ddir=wd
)
gc()

End: r Sys.time()

Rattus norvegicus data

Start: r Sys.time()

BED:::getUniprot(
    organism="Rattus norvegicus", taxDiv="rodents", release=avRel, ddir=wd
)
gc()

End: r Sys.time()

Sus scrofa data

Start: r Sys.time()

BED:::getUniprot(
    organism="Sus scrofa", taxDiv="mammals", release=avRel, ddir=wd
)
gc()

End: r Sys.time()

Indirect cross-references with EntrezGene

Start: r Sys.time()

message("Indirect cross-references with Uniprot")
dumpDir <- file.path(wd, "NCBI-gene-DATA")
f <- "gene2accession.gz"
if(file.exists(dumpDir)){
  load(file.path(dumpDir, "dumpDate.rda"))
  message("Last download: ", dumpDate)
  if(curDate - dumpDate > reDumpThr | !file.exists(file.path(dumpDir, f))){
    toDownload <- TRUE
  }else{
    toDownload <- FALSE
  }
}else{
  message("Not downloaded yet")
  toDownload <- TRUE
}
if(toDownload){
  ftp <- "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/"
  dlok <- try(download.file(
    url=paste0(ftp, f),
    destfile=file.path(dumpDir, f),
    method="wget",
    quiet=T
  ), silent=T)
}else{
  message("Existing data are going to be used")
}
cn <- readLines(file.path(dumpDir, f), n=1)
cn <- sub("^#", "", cn)
cn <- unlist(strsplit(cn, split="[ \t]"))
for(org in listOrganisms()){
  message("   ", org)
  tid <- getTaxId(org)
  toAdd <- read.table(
    text=system(
      sprintf("zgrep ^%s %s", tid, file.path(dumpDir, f)),
      intern=TRUE
    ),
    sep="\t",
    header=F,
    stringsAsFactors=F,
    quote="", comment.char=""
  )
  colnames(toAdd) <- cn
  toAdd <- toAdd[
    which(toAdd$tax_id==tid),
    c("tax_id", "GeneID", "protein_accession.version")
  ]
  toAdd$pacc <- sub("[.].*$", "", toAdd$protein_accession.version)
  curUP <- getBeIdConvTable(
    from="Gene", from.source="BEDTech_gene", organism=org,
    to="Peptide", to.source="Uniprot",
    restricted=TRUE
  )
  toAdd <- merge(
    toAdd[,c("GeneID", "pacc")],
    curUP[,c("from", "to")],
    by.x="pacc", by.y="to",
    all=FALSE
  )
  toAdd <- toAdd[,c("from", "GeneID")]
  toAdd$from <- as.character(toAdd$from)
  toAdd$GeneID <- as.character(toAdd$GeneID)
  colnames(toAdd) <- c("id1", "id2")
  BED:::loadIsAssociatedTo(
    d=toAdd,
    db1="BEDTech_gene", db2="EntrezGene",
    be="Gene"
  )
}

End: r Sys.time()

Loading orthologs

Orthologs from Ensembl

Start: r Sys.time()

## Dump data from ensembl
orgOfInt <- c(
   "homo_sapiens", "mus_musculus", "rattus_norvegicus", "sus_scrofa",
   "danio_rerio"
)
data_dir <- file.path(wd, "ensembl_homologies")
dir.create(data_dir, showWarnings = FALSE)
ensembl_homologies <- c()
for(org in orgOfInt){
   ## Proteins
   f_url <- sprintf(
      "https://ftp.ensembl.org/pub/release-%s/tsv/ensembl-compara/homologies/%s/Compara.%s.protein_default.homologies.tsv.gz",
      ensembl_release, org, ensembl_release
   )
   f_name <- sprintf(
      "%s.Compara.%s.protein_default.homologies.tsv.gz",
      org, ensembl_release
   )
   f_path <- file.path(data_dir, f_name)
   if(!file.exists(f_path)){
      download.file(url = f_url, destfile = f_path)
   }
   to_add <- readr::read_tsv(
      f_path, col_types = "c"
   )
   to_add <- to_add |> 
      filter(
         is_high_confidence == "1",
         species %in% orgOfInt,
         homology_species %in% orgOfInt,
         species != homology_species
      ) |> 
      select(gene_stable_id, homology_gene_stable_id, species, homology_species)
   ensembl_homologies <- bind_rows(
      ensembl_homologies,
      to_add
   )
   ## ncRNA
   f_url <- sprintf(
      "https://ftp.ensembl.org/pub/release-%s/tsv/ensembl-compara/homologies/%s/Compara.%s.ncrna_default.homologies.tsv.gz",
      ensembl_release, org, ensembl_release
   )
   f_name <- sprintf(
      "%s.Compara.%s.ncrna_default.homologies.tsv.gz",
      org, ensembl_release
   )
   f_path <- file.path(data_dir, f_name)
   if(!file.exists(f_path)){
      download.file(url = f_url, destfile = f_path)
   }
   to_add <- readr::read_tsv(
      f_path, col_types = "c"
   )
   to_add <- to_add |> 
      filter(
         is_high_confidence == "1",
         species %in% orgOfInt,
         homology_species %in% orgOfInt,
         species != homology_species
      ) |> 
      select(gene_stable_id, homology_gene_stable_id, species, homology_species)
   ensembl_homologies <- bind_rows(
      ensembl_homologies,
      to_add
   )

}
toImport <- ensembl_homologies |> 
   dplyr::select(id1 = gene_stable_id, id2 = homology_gene_stable_id) |> 
   as.data.frame()
BED:::loadIsHomologOf(
   d=toImport,
   db1="Ens_gene", db2="Ens_gene",
   be="Gene"
)

End: r Sys.time()

Orthologs from NCBI

Start: r Sys.time()

#####################################
gdbname <- "EntrezGene"
taxOfInt <- unlist(lapply(
    c(
       "Homo sapiens", "Mus musculus", "Rattus norvegicus",
       "Sus scrofa", "Danio rerio"
    ),
    getTaxId
))
for(i in 1:length(taxOfInt)){
   BED:::dumpNcbiDb(
       taxOfInt=taxOfInt[i],
       reDumpThr=reDumpThr,
       ddir=wd,
       toLoad=c("gene_orthologs"),
       curDate=curDate
   )
   toImport <- gene_orthologs[
       which(
           gene_orthologs$tax_id %in% taxOfInt &
           gene_orthologs$Other_tax_id %in% taxOfInt &
           gene_orthologs$relationship == "Ortholog"
       ),
       c("GeneID", "Other_GeneID")
   ]
   if(nrow(toImport)>0){
      colnames(toImport) <- c("id1", "id2")
      toImport <- dplyr::mutate_all(toImport, as.character)
      BED:::loadIsHomologOf(
          d=toImport,
          db1=gdbname, db2=gdbname,
          be="Gene"
      )
   }
}
gc()

End: r Sys.time()

Loading probes

Probes from GEO

library(GEOquery)
geoPath <- file.path(wd, "geo")
dir.create(geoPath, showWarnings=FALSE)

GPL1708: Agilent-012391 Whole Human Genome Oligo Microarray G4112A (Feature Number version)

Start: r Sys.time()

## Import plateform
platName <- "GPL1708"
gds <- getGEO(platName, destdir=geoPath)
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping with Entrez
d <- Table(gds)
toImport <- d[which(!is.na(d$SPOT_ID)), c("SPOT_ID", "GENE")]
colnames(toImport) <- c("probeID", "id")
toImport$probeID <- as.character(toImport$probeID)
toImport$id <- as.character(toImport$id)
toImport <- toImport[which(!is.na(toImport$id)),]
toImport <- unique(toImport)
dbname <- "EntrezGene"
##
BED:::loadProbes(
    d=toImport,
    be=be,
    platform=platName,
    dbname=dbname
)
# ## Import mapping with UniGene
# toImport <- d[which(!is.na(d$SPOT_ID)), c("SPOT_ID", "UNIGENE_ID")]
# colnames(toImport) <- c("probeID", "id")
# toImport$probeID <- as.character(toImport$probeID)
# toImport$id <- as.character(toImport$id)
# toImport <- toImport[which(!is.na(toImport$id) & toImport$id!=""),]
# dbname <- "UniGene"
# ##
# BED:::loadProbes(
#     d=toImport,
#     be=be,
#     platform=platName,
#     dbname=dbname
# )

End: r Sys.time()

GPL6480: Agilent-014850 Whole Human Genome Microarray 4x44K G4112F (Probe Name version)

Start: r Sys.time()

## Import plateform
platName <- "GPL6480"
gds <- getGEO(platName, destdir=geoPath)
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping with Entrez
d <- Table(gds)
toImport <- d[which(!is.na(d$ID)), c("ID", "GENE")]
colnames(toImport) <- c("probeID", "id")
toImport$probeID <- as.character(toImport$probeID)
toImport$id <- as.character(toImport$id)
toImport <- toImport[which(!is.na(toImport$id)),]
dbname <- "EntrezGene"
##
BED:::loadProbes(
    d=toImport,
    be=be,
    platform=platName,
    dbname=dbname
)
# ## Import mapping with UniGene
# toImport <- d[which(!is.na(d$ID)), c("ID", "UNIGENE_ID")]
# colnames(toImport) <- c("probeID", "id")
# toImport$probeID <- as.character(toImport$probeID)
# toImport$id <- as.character(toImport$id)
# toImport <- toImport[which(!is.na(toImport$id)),]
# dbname <- "UniGene"
# ##
# BED:::loadProbes(
#     d=toImport,
#     be=be,
#     platform=platName,
#     dbname=dbname
# )

End: r Sys.time()

GPL570: Affymetrix Human Genome U133 Plus 2.0 Array

Start: r Sys.time()

## Import plateform
platName <- "GPL570"
gds <- getGEO(platName, destdir=geoPath)
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
d <- Table(gds)
toImport <- strsplit(
    as.character(d$ENTREZ_GENE_ID),
    split=" /// "
)
names(toImport) <- d$ID
toImport <- stack(toImport)
toImport[,1] <- as.character(toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
dbname <- "EntrezGene"
##
BED:::loadProbes(
    d=toImport,
    be=be,
    platform=platName,
    dbname=dbname
)

End: r Sys.time()

GPL571: Affymetrix Human Genome U133A 2.0 Array

Start: r Sys.time()

## Import plateform
platName <- "GPL571"
gds <- getGEO(platName, destdir=geoPath)
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
d <- Table(gds)
toImport <- strsplit(
    as.character(d$ENTREZ_GENE_ID),
    split=" /// "
)
names(toImport) <- d$ID
toImport <- stack(toImport)
toImport[,1] <- as.character(toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
dbname <- "EntrezGene"
##
BED:::loadProbes(
    d=toImport,
    be=be,
    platform=platName,
    dbname=dbname
)

End: r Sys.time()

GPL13158: Affymetrix HT HG-U133+ PM Array Plate

Start: r Sys.time()

## Import plateform
platName <- "GPL13158"
gds <- getGEO(platName, destdir=geoPath)
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
d <- Table(gds)
toImport <- strsplit(
    as.character(d$ENTREZ_GENE_ID),
    split=" /// "
)
names(toImport) <- d$ID
toImport <- stack(toImport)
toImport[,1] <- as.character(toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
dbname <- "EntrezGene"
##
BED:::loadProbes(
    d=toImport,
    be=be,
    platform=platName,
    dbname=dbname
)

End: r Sys.time()

GPL96: Affymetrix Human Genome U133A Array

Start: r Sys.time()

## Import plateform
platName <- "GPL96"
gds <- getGEO(platName, destdir=geoPath)
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
d <- Table(gds)
toImport <- strsplit(
    as.character(d$ENTREZ_GENE_ID),
    split=" /// "
)
names(toImport) <- d$ID
toImport <- stack(toImport)
toImport[,1] <- as.character(toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
dbname <- "EntrezGene"
##
BED:::loadProbes(
    d=toImport,
    be=be,
    platform=platName,
    dbname=dbname
)

End: r Sys.time()

GPL1261: Affymetrix Mouse Genome 430 2.0 Array

Start: r Sys.time()

## Import plateform
platName <- "GPL1261"
gds <- getGEO(platName, destdir=geoPath)
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
d <- Table(gds)
toImport <- strsplit(
    as.character(d$ENTREZ_GENE_ID),
    split=" /// "
)
names(toImport) <- d$ID
toImport <- stack(toImport)
toImport[,1] <- as.character(toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
dbname <- "EntrezGene"
##
BED:::loadProbes(
    d=toImport,
    be=be,
    platform=platName,
    dbname=dbname
)

End: r Sys.time()

GPL1355: Affymetrix Rat Genome 230 2.0 Array

Start: r Sys.time()

## Import plateform
platName <- "GPL1355"
gds <- getGEO(platName, destdir=geoPath)
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
d <- Table(gds)
toImport <- strsplit(
    as.character(d$ENTREZ_GENE_ID),
    split=" /// "
)
names(toImport) <- d$ID
toImport <- stack(toImport)
toImport[,1] <- as.character(toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
dbname <- "EntrezGene"
##
BED:::loadProbes(
    d=toImport,
    be=be,
    platform=platName,
    dbname=dbname
)

End: r Sys.time()

GPL10558: Illumina HumanHT-12 V4.0 expression beadchip

Start: r Sys.time()

## Import plateform
platName <- "GPL10558"
gds <- getGEO(platName, destdir=geoPath)
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
d <- Table(gds)
toImport <- d[,c("Entrez_Gene_ID", "ID")]
toImport[,1] <- as.character(toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
toImport <- toImport[which(!is.na(toImport$id) & toImport$id != ""),]
dbname <- "EntrezGene"
##
BED:::loadProbes(
    d=toImport,
    be=be,
    platform=platName,
    dbname=dbname
)

End: r Sys.time()

GPL6947: Illumina HumanHT-12 V3.0 expression beadchip

Start: r Sys.time()

## Import plateform
platName <- "GPL6947"
gds <- getGEO(platName, destdir=geoPath)
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
d <- Table(gds)
toImport <- d[,c("Entrez_Gene_ID", "ID")]
toImport[,1] <- as.character(toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
toImport <- toImport[which(!is.na(toImport$id) & toImport$id != ""),]
dbname <- "EntrezGene"
##
BED:::loadProbes(
    d=toImport,
    be=be,
    platform=platName,
    dbname=dbname
)

End: r Sys.time()

GPL6885: Illumina MouseRef-8 v2.0 expression beadchip

Start: r Sys.time()

## Import plateform
platName <- "GPL6885"
gds <- getGEO(platName, destdir=geoPath)
platDesc <- Meta(gds)$title
d <- Table(gds)
# e <- getBeIds(
#    be="Gene", source="EntrezGene", organism="mouse", restricted=FALSE
# )
# sum(d$Entrez_Gene_ID %in% e$id) < sum(sub("[.].*$", "", d$RefSeq_ID) %in% f$id)
be <- "Transcript"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
toImport <- d[,c("RefSeq_ID", "ID")]
toImport[,1] <- as.character(toImport[,1])
toImport[,1] <- sub("[.].*$", "", toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
toImport <- toImport[which(!is.na(toImport$id) & toImport$id != ""),]
dbname <- "RefSeq"
##
BED:::loadProbes(
    d=toImport,
    be=be,
    platform=platName,
    dbname=dbname
)

End: r Sys.time()

GPL6887: Illumina MouseWG-6 v2.0 expression beadchip

Start: r Sys.time()

## Import plateform
platName <- "GPL6887"
gds <- getGEO(platName, destdir=geoPath)
platDesc <- Meta(gds)$title
d <- Table(gds)
# e <- getBeIds(
#    be="Gene", source="EntrezGene", organism="mouse", restricted=FALSE
# )
# sum(d$Entrez_Gene_ID %in% e$id) > sum(sub("[.].*$", "", d$RefSeq_ID) %in% f$id)
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
toImport <- d[,c("Entrez_Gene_ID", "ID")]
toImport[,1] <- as.character(toImport[,1])
# toImport[,1] <- sub("[.].*$", "", toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
toImport <- toImport[which(!is.na(toImport$id) & toImport$id != ""),]
dbname <- "EntrezGene"
##
BED:::loadProbes(
    d=toImport,
    be=be,
    platform=platName,
    dbname=dbname
)

End: r Sys.time()

GPL6101: Illumina ratRef-12 v1.0 expression beadchip

Start: r Sys.time()

## Import plateform
platName <- "GPL6101"
gds <- getGEO(platName, destdir=geoPath)
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
d <- Table(gds)
toImport <- d[,c("Entrez_Gene_ID", "ID")]
toImport[,1] <- as.character(toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
toImport <- toImport[which(!is.na(toImport$id) & toImport$id != ""),]
dbname <- "EntrezGene"
##
BED:::loadProbes(
    d=toImport,
    be=be,
    platform=platName,
    dbname=dbname
)

End: r Sys.time()

Probes from Ensembl

Start: r Sys.time()

to_exclude <- c(
   "affy_huex_1_0_st_v2",
   "affy_moex_1_0_st_v1",
   "affy_raex_1_0_st_v1"
)
dump_ensembl_func <- function(ens_def){
   base_url <- sprintf(
      "https://ftp.ensembl.org/pub/current_mysql/%s_funcgen_%s_%s",
      stringr::str_replace(tolower(ens_def$organism), " ", "_"),
      ens_def$release, ens_def$gv
   )
   data_dir <- file.path(
      wd,
      sprintf(
         "%s_funcgen_%s_%s",
         stringr::str_replace(tolower(ens_def$organism), " ", "_"),
         ens_def$release, ens_def$gv
      )
   )
   dir.create(data_dir, showWarnings = FALSE)
   to_download <- c(
      "array.txt.gz",
      "probe.txt.gz",
      "probe_set.txt.gz",
      "probe_transcript.txt.gz",
      "probe_set_transcript.txt.gz"
   )
   for(f in to_download){
      fp <- file.path(data_dir, f)
      if(!file.exists(fp)){
         download.file(file.path(base_url, f), fp)
      }
   }
}
load_ensembl_probes <- function(ens_def){
   data_dir <- file.path(
      wd,
      sprintf(
         "%s_funcgen_%s_%s",
         stringr::str_replace(tolower(ens_def$organism), " ", "_"),
         ens_def$release, ens_def$gv
      )
   )
   array <- readr::read_tsv(
      file.path(data_dir, "array.txt.gz"),
      col_names = c(
         "array_id",
         "name",
         "format",
         "vendor",
         "description",
         "type",
         "class",
         "is_probeset_array",
         "is_linked_array",
         "has_sense_interrogation"
      ),
      col_types = "ccccccclll",
      na = "\\N"
   )
   array <- array |>
      dplyr::filter(
         format %in% c("EXPRESSION"),
         !.data$name %in% to_exclude
      )
   probe_arrays <- array |> filter(!is_probeset_array) |> pull(array_id)
   probe <- readr::read_tsv(
      file.path(data_dir, "probe.txt.gz"),
      col_names = c(
         "probe_id",
         "probe_set_id",
         "name",
         "length",
         "array_chip_id",
         "class",
         "description",
         "probe_seq_id"
      ),
      col_types = "cccnccccc",
      na = "\\N"
   )
   probe <- probe |> filter(array_chip_id %in% probe_arrays)
   probe_set <- readr::read_tsv(
      file.path(data_dir, "probe_set.txt.gz"),
      col_names = c(
         "probe_set_id",
         "name",
         "size",
         "family",
         "array_chip_id"
      ),
      col_types = "cccncc",
      na = "\\N"
   )
   probe_transcript <- readr::read_tsv(
      file.path(data_dir, "probe_transcript.txt.gz"),
      col_names = c(
         "probe_transcript_id",
         "probe_id",
         "stable_id",
         "description"
      ),
      col_types = "cccc",
      na = "\\N"
   )
   probe_transcript <- probe_transcript |> 
      filter(probe_id %in% probe$probe_id)
   probe_set_transcript <- readr::read_tsv(
      file.path(data_dir, "probe_set_transcript.txt.gz"),
      col_names = c(
         "probe_set_transcript_id",
         "probe_set_id",
         "stable_id",
         "description"
      ),
      col_types = "cccc",
      na = "\\N"
   )
   be <- "Transcript"
   dbname <- "Ens_transcript"
   for(i in 1:nrow(array)){
      message("   ", i, "/", nrow(array), " platforms")
      message("      ", Sys.time())
      platName <- tolower(paste(
         str_replace(ens_def$organism, " ", "_"),
         array$vendor[i], array$name[i], sep = "_"
      ))
      is_probe_set <- array$is_probeset_array[i]
      platDesc <- paste(
         array$vendor[i],
         array$name[i],
         array$type[i],
         ifelse(is_probe_set, "probe set", "probe"),
         ens_def$organism,
         "(Ensembl mapping)"
      )
      array_id <- array$array_id[i]
      if(is_probe_set){
         toImport <- probe_set |>
            dplyr::filter(.data$array_chip_id==!!array_id) |> 
            dplyr::select("probe_set_id", "name") |> 
            dplyr::inner_join(
               probe_set_transcript |>
                  dplyr::select("probe_set_id", "stable_id"),
               by = "probe_set_id"
            ) |> 
            dplyr::select("probeID" = "name", "id" = "stable_id") |> 
            dplyr::distinct()
      }else{
         toImport <- probe |>
            dplyr::filter(.data$array_chip_id==!!array_id) |> 
            dplyr::select("probe_id", "name") |> 
            dplyr::inner_join(
               probe_transcript |> dplyr::select("probe_id", "stable_id"),
               by = "probe_id"
            ) |> 
            dplyr::select("probeID" = "name", "id" = "stable_id") |> 
            dplyr::distinct()
      }
      if(nrow(toImport) > 0){
         ## Platform description
         BED:::loadPlf(name=platName, description=platDesc, be=be)
         ## Import mapping with Ens_transcript
         BED:::loadProbes(
            d=toImport,
            be=be,
            platform=platName,
            dbname=dbname
         )
      }
   }
}

message("Ensembl probes for Drerio")
dump_ensembl_func(ensembl_Drerio)
load_ensembl_probes(ensembl_Drerio)
message("Ensembl probes for Hsapiens")
dump_ensembl_func(ensembl_Hsapiens)
load_ensembl_probes(ensembl_Hsapiens)
message("Ensembl probes for Mmusculus")
dump_ensembl_func(ensembl_Mmusculus)
load_ensembl_probes(ensembl_Mmusculus)
message("Ensembl probes for Rnorvegicus")
dump_ensembl_func(ensembl_Rnorvegicus)
load_ensembl_probes(ensembl_Rnorvegicus)
message("Ensembl probes for Sscrofa")
dump_ensembl_func(ensembl_Sscrofa)
load_ensembl_probes(ensembl_Sscrofa)

End: r Sys.time()

Other information

Databases ID URL

otherIdURL <- list(
    # "HGNC"='http://www.genenames.org/cgi-bin/gene_symbol_report?hgnc_id=%s',
    # "HGNC"='https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/HGNC:%s',
    "miRBase"='http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s',
    "Vega_gene"='http://vega.sanger.ac.uk/id/%s',
    "UniGene"='https://www.ncbi.nlm.nih.gov/unigene?term=%s',
    "Vega_transcript"='http://vega.sanger.ac.uk/id/%s',
    "MGI"='http://www.informatics.jax.org/marker/MGI:%s',
    "Vega_translation"='http://vega.sanger.ac.uk/id/%s',
    "RGD"='https://rgd.mcw.edu/rgdweb/report/gene/main.html?id=%s',
    "MIM_GENE"='http://www.omim.org/entry/%s',
    "ZFIN_gene"='http://zfin.org/%s'
)
for(db in names(otherIdURL)){
    BED:::registerBEDB(
        name=db,
        idURL=otherIdURL[[db]]
    )   
}

Load Lucene Indexes

Start: r Sys.time()

BED:::loadLuceneIndexes()

End: r Sys.time()

Session info

clearBedCache(force = TRUE, hard = TRUE)
sessionInfo()

patzaw/BED documentation built on July 5, 2025, 5:23 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

patzaw/BED Biological Entity Dictionary (BED)

In patzaw/BED: Biological Entity Dictionary (BED)

Introduction

BED initialization

Check empty DB

Set BED instance and version

Load Data model

Loading taxonomy from NCBI

Loading data from Ensembl

Register Ensembl DBs

Genes

Transcripts

Peptides

Danio rerio

Genes

Transcripts

Peptides

Homo sapiens

Genes

Transcripts

Peptides

Mus musculus

Genes

Transcripts

Peptides

Rattus norvegicus

Genes

Transcripts

Peptides

Sus scrofa

Genes

Transcripts

Peptides

Loading data from NCBI

Register NCBI DBs

Genes

Transcripts

Peptides

Danio rerio data

Homo sapiens data

Mus musculus data

Rattus norvegicus data

Sus scrofa data

Direct cross-references with Uniprot

Loading data from HGNC

DB information

Identifiers

Symbols

Names

Cross references

EntrezGene

Ens_gene

MIM_GENE

Loading data from Uniprot

Danio rerio data

Homo sapiens data

Mus musculus data

Rattus norvegicus data

Sus scrofa data

Indirect cross-references with EntrezGene

Loading orthologs

Orthologs from Ensembl

Orthologs from NCBI

Loading probes

Probes from GEO

GPL1708: Agilent-012391 Whole Human Genome Oligo Microarray G4112A (Feature Number version)

GPL6480: Agilent-014850 Whole Human Genome Microarray 4x44K G4112F (Probe Name version)

GPL570: Affymetrix Human Genome U133 Plus 2.0 Array

GPL571: Affymetrix Human Genome U133A 2.0 Array

GPL13158: Affymetrix HT HG-U133+ PM Array Plate

GPL96: Affymetrix Human Genome U133A Array

GPL1261: Affymetrix Mouse Genome 430 2.0 Array

GPL1355: Affymetrix Rat Genome 230 2.0 Array

GPL10558: Illumina HumanHT-12 V4.0 expression beadchip

GPL6947: Illumina HumanHT-12 V3.0 expression beadchip

GPL6885: Illumina MouseRef-8 v2.0 expression beadchip

GPL6887: Illumina MouseWG-6 v2.0 expression beadchip

GPL6101: Illumina ratRef-12 v1.0 expression beadchip

Probes from Ensembl

Other information

patzaw/BED
Biological Entity Dictionary (BED)