title: "Biological Entity Dictionary (BED): Feeding the DB" author: "Patrice Godard" date: "January 14 2024" abstract: "Dump source identifiers related information and integrate content in BED" output: html_document: fig_width: 9 fig_height: 5 keep_md: yes number_sections: yes theme: cerulean toc: yes toc_float: yes editor_options: chunk_output_type: console
This document shows how to feed the Biological Entity Dictionary (BED).
It can be adapted according to specific needs and DB access.
The BED functions used to feed the DB are not exported to avoid
unintended modifications of the DB. To call them, they are preceded
by BED:::
.
In this example several source databases are dumped and their content
is integrated in BED.
Some helper functions are provided to get information from famous databases.
The following chunk is used to configure source versions.
The reDumpThr
object is used to define time intervals during which some
data sources should not be re-downloaded.
##
library(knitr)
library(BED)
##
# if("metabaser" %in% rownames(installed.packages())){
# source("helpers/loadMBObjects.R")
# }else{
# stop("The Clarivate analytics metabaser package is not installed.")
# }
# library(TKCat)
# k <- chTKCat("bel040344", password="")
.get_tk_headers <- do.call(function(){
## Choose the relevant credentials
credentials <- readRDS("~/etc/kmt_authorization.rds")
credentials$refresh()
return(function(){
if(!credentials$validate()){
credentials$refresh()
}
list(
"Authorization"=paste("Bearer", credentials$credentials$access_token)
)
})
}, list())
library(TKCat)
.tkcon <- chTKCat(
"tkcat.ucb.com",
password="",
port=443, https=TRUE,
extended_headers=.get_tk_headers()
)
.db_reconnect <- function(x){
xn <- deparse(substitute(x))
nv <- db_reconnect(x, extended_headers=.get_tk_headers())
assign(xn, nv, envir=parent.frame(n=1))
invisible(nv)
}
if("MetaBase" %in% list_MDBs(.tkcon)$name){
source("helpers/loadMBObjects_fromTKCat.R")
}else{
stop("MetaBase is not available in this TKCat instance.")
}
##
workingDirectory <- "../../../working"
##
opts_knit$set(root.dir=workingDirectory)
opts_chunk$set(
eval=TRUE,
message=FALSE,
root.dir=workingDirectory
)
## Specific config
bedInstance <- "UCB-Human"
bedVersion <- format(Sys.Date(), "%Y.%m.%d")
ensembl_release <- "111"
ensembl_Hsapiens <- list(
release=ensembl_release,
organism="Homo sapiens",
gv="38", # genome version
gdbCref=c( # Gene cross-references DBs
"HGNC"="HGNC",
"EntrezGene"="EntrezGene",
"Vega_gene"="Vega_gene",
"Ens_Hs_gene"="Ens_gene"
),
gdbAss=c( # Gene associated IDs (DB)
"miRBase"="miRBase",
"MIM_GENE"="MIM_GENE",
"UniGene"="UniGene"
),
tdbCref=c( # Transcript cross-references DBs
"RefSeq_mRNA"="RefSeq",
"RefSeq_ncRNA"="RefSeq",
"RefSeq_mRNA_predicted"="RefSeq",
"RefSeq_ncRNA_predicted"="RefSeq",
"Vega_transcript"="Vega_transcript",
"Ens_Hs_transcript"="Ens_transcript"
),
pdbCref=c( # Peptide cross-references DBs
"RefSeq_peptide"="RefSeq_peptide",
"RefSeq_peptide_predicted"="RefSeq_peptide",
"Uniprot/SPTREMBL"="Uniprot",
"Uniprot/SWISSPROT"="Uniprot",
"Vega_translation"="Vega_translation",
"Ens_Hs_translation"="Ens_translation"
),
canChromosomes=c(1:22, "X", "Y", "MT")
)
ensembl_Mmusculus <- list(
release=ensembl_release,
organism="Mus musculus",
gv="39", # genome version
gdbCref=c( # Gene cross-references DBs
"MGI"="MGI",
"EntrezGene"="EntrezGene",
"Vega_gene"="Vega_gene",
"Ens_Mm_gene"="Ens_gene"
),
gdbAss=c( # Gene associated IDs (DB)
"miRBase"="miRBase",
"UniGene"="UniGene"
),
tdbCref=c( # Transcript cross-references DBs
"RefSeq_mRNA"="RefSeq",
"RefSeq_ncRNA"="RefSeq",
"RefSeq_mRNA_predicted"="RefSeq",
"RefSeq_ncRNA_predicted"="RefSeq",
"Vega_transcript"="Vega_transcript",
"Ens_Mm_transcript"="Ens_transcript"
),
pdbCref=c( # Peptide cross-references DBs
"RefSeq_peptide"="RefSeq_peptide",
"RefSeq_peptide_predicted"="RefSeq_peptide",
"Uniprot/SPTREMBL"="Uniprot",
"Uniprot/SWISSPROT"="Uniprot",
"Vega_translation"="Vega_translation",
"Ens_Mm_translation"="Ens_translation"
),
canChromosomes=c(1:19, "X", "Y", "MT")
)
ensembl_Rnorvegicus <- list(
release=ensembl_release,
organism="Rattus norvegicus",
gv="72", # genome version
gdbCref=c( # Gene cross-references DBs
"RGD"="RGD",
"EntrezGene"="EntrezGene",
"Vega_gene"="Vega_gene",
"Ens_Rn_gene"="Ens_gene"
),
gdbAss=c( # Gene associated IDs (DB)
"miRBase"="miRBase",
"UniGene"="UniGene"
),
tdbCref=c( # Transcript cross-references DBs
"RefSeq_mRNA"="RefSeq",
"RefSeq_ncRNA"="RefSeq",
"RefSeq_mRNA_predicted"="RefSeq",
"RefSeq_ncRNA_predicted"="RefSeq",
"Vega_transcript"="Vega_transcript",
"Ens_Rn_transcript"="Ens_transcript"
),
pdbCref=c( # Peptide cross-references DBs
"RefSeq_peptide"="RefSeq_peptide",
"RefSeq_peptide_predicted"="RefSeq_peptide",
"Uniprot/SPTREMBL"="Uniprot",
"Uniprot/SWISSPROT"="Uniprot",
"Vega_translation"="Vega_translation",
"Ens_Rn_translation"="Ens_translation"
),
canChromosomes=c(1:20, "X", "Y", "MT")
)
ensembl_Sscrofa <- list(
release=ensembl_release,
organism="Sus scrofa",
gv="111", # genome version
gdbCref=c( # Gene cross-references DBs
"EntrezGene"="EntrezGene",
"Vega_gene"="Vega_gene",
"Ens_Ss_gene"="Ens_gene"
),
gdbAss=c( # Gene associated IDs (DB)
"miRBase"="miRBase",
"UniGene"="UniGene"
),
tdbCref=c( # Transcript cross-references DBs
"RefSeq_mRNA"="RefSeq",
"RefSeq_ncRNA"="RefSeq",
"RefSeq_mRNA_predicted"="RefSeq",
"RefSeq_ncRNA_predicted"="RefSeq",
"Vega_transcript"="Vega_transcript",
"Ens_Ss_transcript"="Ens_transcript"
),
pdbCref=c( # Peptide cross-references DBs
"RefSeq_peptide"="RefSeq_peptide",
"RefSeq_peptide_predicted"="RefSeq_peptide",
"Uniprot/SPTREMBL"="Uniprot",
"Uniprot/SWISSPROT"="Uniprot",
"Vega_translation"="Vega_translation",
"Ens_Ss_translation"="Ens_translation"
),
canChromosomes=c(1:18, "X", "Y", "MT")
)
ensembl_Drerio <- list(
release=ensembl_release,
organism="Danio rerio",
gv="11", # genome version
gdbCref=c( # Gene cross-references DBs
"EntrezGene"="EntrezGene",
"ZFIN_ID"="ZFIN_gene",
"Vega_gene"="Vega_gene",
"Ens_Dr_gene"="Ens_gene"
),
gdbAss=c( # Gene associated IDs (DB)
"miRBase"="miRBase",
"UniGene"="UniGene"
),
tdbCref=c( # Transcript cross-references DBs
"RefSeq_mRNA"="RefSeq",
"RefSeq_ncRNA"="RefSeq",
"RefSeq_mRNA_predicted"="RefSeq",
"RefSeq_ncRNA_predicted"="RefSeq",
"Vega_transcript"="Vega_transcript",
"Ens_Dr_transcript"="Ens_transcript"
),
pdbCref=c( # Peptide cross-references DBs
"RefSeq_peptide"="RefSeq_peptide",
"RefSeq_peptide_predicted"="RefSeq_peptide",
"Uniprot/SPTREMBL"="Uniprot",
"Uniprot/SWISSPROT"="Uniprot",
"Vega_translation"="Vega_translation",
"Ens_Dr_translation"="Ens_translation"
),
canChromosomes=c(1:25, "MT")
)
## General config
reDumpThr <- as.difftime(4, units="days")
curDate <- Sys.Date()
BED is based on Neo4j.
The S01-NewBED-Container.sh shows how to run it in a docker container.
Because the import functions use massively the LOAD CSV
Neo4j query,
the feeding of the BED database can only be down from the
computer hosting the Neo4j relevant instance.
The chunk below shows how to connect to BED. In this example, neo4j authentication is disabled.
connectToBed(
url="localhost:5410",
remember=FALSE,
useCache=TRUE,
importPath=file.path(getwd(), "neo4jImport")
)
## Warning in checkBedConn(verbose = TRUE): BED DB is empty !
## Warning in checkBedConn(): BED DB is empty !
## Warning in checkBedConn(): BED DB is empty !
## Warning in checkBedCache(newCon = TRUE): Clearing cache
## Warning in checkBedConn(verbose = FALSE): BED DB is empty !
Do not go further if your BED DB is not empty.
dbSize <- bedCall(cypher, 'MATCH (n) RETURN count(n)')[,1]
if(dbSize!=0){
stop("BED DB is not empty ==> clean it before loading the content below")
}
print(bedInstance)
## [1] "UCB-Human"
print(bedVersion)
## [1] "2024.01.14"
BED:::setBedVersion(bedInstance=bedInstance, bedVersion=bedVersion)
Start: 2024-01-14 11:47:01.757516
BED:::loadBedModel()
End: 2024-01-14 11:47:06.863333
Information is downloaded if older than 4 days
according to the reDumpThr
object.
Start: 2024-01-14 11:47:06.864832
BED:::loadNcbiTax(
reDumpThr=reDumpThr,
ddir=".",
orgOfInt=c(
"Homo sapiens", "Rattus norvegicus", "Mus musculus",
"Sus scrofa", "Danio rerio"
),
curDate=curDate
)
End: 2024-01-14 11:47:25.977368
BED:::registerBEDB(
name="Ens_gene",
description="Ensembl gene",
currentVersion=ensembl_release,
idURL='http://www.ensembl.org/id/%s'
)
BED:::registerBEDB(
name="Ens_transcript",
description="Ensembl transcript",
currentVersion=ensembl_release,
idURL='http://www.ensembl.org/id/%s'
)
BED:::registerBEDB(
name="Ens_translation",
description="Ensembl peptides",
currentVersion=ensembl_release,
idURL='http://www.ensembl.org/id/%s'
)
ensembl <- ensembl_Drerio
print(ensembl)
## $release
## [1] "111"
##
## $organism
## [1] "Danio rerio"
##
## $gv
## [1] "11"
##
## $gdbCref
## EntrezGene ZFIN_ID Vega_gene Ens_Dr_gene
## "EntrezGene" "ZFIN_gene" "Vega_gene" "Ens_gene"
##
## $gdbAss
## miRBase UniGene
## "miRBase" "UniGene"
##
## $tdbCref
## RefSeq_mRNA RefSeq_ncRNA RefSeq_mRNA_predicted
## "RefSeq" "RefSeq" "RefSeq"
## RefSeq_ncRNA_predicted Vega_transcript Ens_Dr_transcript
## "RefSeq" "Vega_transcript" "Ens_transcript"
##
## $pdbCref
## RefSeq_peptide RefSeq_peptide_predicted Uniprot/SPTREMBL
## "RefSeq_peptide" "RefSeq_peptide" "Uniprot"
## Uniprot/SWISSPROT Vega_translation Ens_Dr_translation
## "Uniprot" "Vega_translation" "Ens_translation"
##
## $canChromosomes
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "MT"
Start: 2024-01-14 11:47:26.386705
BED:::getEnsemblGeneIds(
organism=ensembl$organism,
release=ensembl$release,
gv=ensembl$gv,
ddir=".",
dbCref=ensembl$gdbCref,
dbAss=ensembl$gdbAss,
canChromosomes=ensembl$canChromosomes
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2606111 139.2 8062171 430.6 10077713 538.3
## Vcells 8713004 66.5 43868404 334.7 68544379 523.0
End: 2024-01-14 11:50:31.316136
Start: 2024-01-14 11:50:31.316714
BED:::getEnsemblTranscriptIds(
organism=ensembl$organism,
release=ensembl$release,
gv=ensembl$gv,
ddir=".",
dbCref=ensembl$tdbCref,
canChromosomes=ensembl$canChromosomes
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2609321 139.4 8062171 430.6 10077713 538.3
## Vcells 8720097 66.6 50677200 386.7 68544379 523.0
End: 2024-01-14 11:53:01.396618
Start: 2024-01-14 11:53:01.398333
BED:::getEnsemblPeptideIds(
organism=ensembl$organism,
release=ensembl$release,
gv=ensembl$gv,
ddir=".",
dbCref=ensembl$pdbCref,
canChromosomes=ensembl$canChromosomes
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2612368 139.6 8062171 430.6 10077713 538.3
## Vcells 8726827 66.6 48714112 371.7 68544379 523.0
End: 2024-01-14 11:55:50.702204
ensembl <- ensembl_Hsapiens
print(ensembl)
## $release
## [1] "111"
##
## $organism
## [1] "Homo sapiens"
##
## $gv
## [1] "38"
##
## $gdbCref
## HGNC EntrezGene Vega_gene Ens_Hs_gene
## "HGNC" "EntrezGene" "Vega_gene" "Ens_gene"
##
## $gdbAss
## miRBase MIM_GENE UniGene
## "miRBase" "MIM_GENE" "UniGene"
##
## $tdbCref
## RefSeq_mRNA RefSeq_ncRNA RefSeq_mRNA_predicted
## "RefSeq" "RefSeq" "RefSeq"
## RefSeq_ncRNA_predicted Vega_transcript Ens_Hs_transcript
## "RefSeq" "Vega_transcript" "Ens_transcript"
##
## $pdbCref
## RefSeq_peptide RefSeq_peptide_predicted Uniprot/SPTREMBL
## "RefSeq_peptide" "RefSeq_peptide" "Uniprot"
## Uniprot/SWISSPROT Vega_translation Ens_Hs_translation
## "Uniprot" "Vega_translation" "Ens_translation"
##
## $canChromosomes
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "X" "Y" "MT"
Start: 2024-01-14 11:55:50.709869
BED:::getEnsemblGeneIds(
organism=ensembl$organism,
release=ensembl$release,
gv=ensembl$gv,
ddir=".",
dbCref=ensembl$gdbCref,
dbAss=ensembl$gdbAss,
canChromosomes=ensembl$canChromosomes
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2612305 139.6 10934034 584.0 13667542 730.0
## Vcells 8726965 66.6 131222916 1001.2 164014750 1251.4
End: 2024-01-14 12:00:39.969077
Start: 2024-01-14 12:00:39.969501
BED:::getEnsemblTranscriptIds(
organism=ensembl$organism,
release=ensembl$release,
gv=ensembl$gv,
ddir=".",
dbCref=ensembl$tdbCref,
canChromosomes=ensembl$canChromosomes
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2612771 139.6 10528672 562.3 13667542 730.0
## Vcells 8727763 66.6 126038000 961.6 164014750 1251.4
End: 2024-01-14 12:05:53.39851
Start: 2024-01-14 12:05:53.398962
BED:::getEnsemblPeptideIds(
organism=ensembl$organism,
release=ensembl$release,
gv=ensembl$gv,
ddir=".",
dbCref=ensembl$pdbCref,
canChromosomes=ensembl$canChromosomes
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2612629 139.6 10139525 541.6 13667542 730.0
## Vcells 8727589 66.6 121060480 923.7 164014750 1251.4
End: 2024-01-14 12:12:55.22442
ensembl <- ensembl_Mmusculus
print(ensembl)
## $release
## [1] "111"
##
## $organism
## [1] "Mus musculus"
##
## $gv
## [1] "39"
##
## $gdbCref
## MGI EntrezGene Vega_gene Ens_Mm_gene
## "MGI" "EntrezGene" "Vega_gene" "Ens_gene"
##
## $gdbAss
## miRBase UniGene
## "miRBase" "UniGene"
##
## $tdbCref
## RefSeq_mRNA RefSeq_ncRNA RefSeq_mRNA_predicted
## "RefSeq" "RefSeq" "RefSeq"
## RefSeq_ncRNA_predicted Vega_transcript Ens_Mm_transcript
## "RefSeq" "Vega_transcript" "Ens_transcript"
##
## $pdbCref
## RefSeq_peptide RefSeq_peptide_predicted Uniprot/SPTREMBL
## "RefSeq_peptide" "RefSeq_peptide" "Uniprot"
## Uniprot/SWISSPROT Vega_translation Ens_Mm_translation
## "Uniprot" "Vega_translation" "Ens_translation"
##
## $canChromosomes
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "X" "Y" "MT"
Start: 2024-01-14 12:12:55.232118
BED:::getEnsemblGeneIds(
organism=ensembl$organism,
release=ensembl$release,
gv=ensembl$gv,
ddir=".",
dbCref=ensembl$gdbCref,
dbAss=ensembl$gdbAss,
canChromosomes=ensembl$canChromosomes
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2612422 139.6 8477593 452.8 13667542 730.0
## Vcells 8727487 66.6 96848384 738.9 164014750 1251.4
End: 2024-01-14 12:16:01.442534
Start: 2024-01-14 12:16:01.44326
BED:::getEnsemblTranscriptIds(
organism=ensembl$organism,
release=ensembl$release,
gv=ensembl$gv,
ddir=".",
dbCref=ensembl$tdbCref,
canChromosomes=ensembl$canChromosomes
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2612501 139.6 8265491 441.5 13667542 730.0
## Vcells 8727639 66.6 77478708 591.2 164014750 1251.4
End: 2024-01-14 12:19:30.662379
Start: 2024-01-14 12:19:30.662823
BED:::getEnsemblPeptideIds(
organism=ensembl$organism,
release=ensembl$release,
gv=ensembl$gv,
ddir=".",
dbCref=ensembl$pdbCref,
canChromosomes=ensembl$canChromosomes
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2612590 139.6 8265491 441.5 13667542 730.0
## Vcells 8727850 66.6 74443560 568.0 164014750 1251.4
End: 2024-01-14 12:23:15.392183
ensembl <- ensembl_Rnorvegicus
print(ensembl)
## $release
## [1] "111"
##
## $organism
## [1] "Rattus norvegicus"
##
## $gv
## [1] "72"
##
## $gdbCref
## RGD EntrezGene Vega_gene Ens_Rn_gene
## "RGD" "EntrezGene" "Vega_gene" "Ens_gene"
##
## $gdbAss
## miRBase UniGene
## "miRBase" "UniGene"
##
## $tdbCref
## RefSeq_mRNA RefSeq_ncRNA RefSeq_mRNA_predicted
## "RefSeq" "RefSeq" "RefSeq"
## RefSeq_ncRNA_predicted Vega_transcript Ens_Rn_transcript
## "RefSeq" "Vega_transcript" "Ens_transcript"
##
## $pdbCref
## RefSeq_peptide RefSeq_peptide_predicted Uniprot/SPTREMBL
## "RefSeq_peptide" "RefSeq_peptide" "Uniprot"
## Uniprot/SWISSPROT Vega_translation Ens_Rn_translation
## "Uniprot" "Vega_translation" "Ens_translation"
##
## $canChromosomes
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "X" "Y" "MT"
Start: 2024-01-14 12:23:15.400541
BED:::getEnsemblGeneIds(
organism=ensembl$organism,
release=ensembl$release,
gv=ensembl$gv,
ddir=".",
dbCref=ensembl$gdbCref,
dbAss=ensembl$gdbAss,
canChromosomes=ensembl$canChromosomes
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2612437 139.6 8265491 441.5 13667542 730.0
## Vcells 8727839 66.6 59554848 454.4 164014750 1251.4
End: 2024-01-14 12:25:32.767507
Start: 2024-01-14 12:25:32.767965
BED:::getEnsemblTranscriptIds(
organism=ensembl$organism,
release=ensembl$release,
gv=ensembl$gv,
ddir=".",
dbCref=ensembl$tdbCref,
canChromosomes=ensembl$canChromosomes
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2612471 139.6 8265491 441.5 13667542 730.0
## Vcells 8727916 66.6 47643879 363.5 164014750 1251.4
End: 2024-01-14 12:27:05.667845
Start: 2024-01-14 12:27:05.668269
BED:::getEnsemblPeptideIds(
organism=ensembl$organism,
release=ensembl$release,
gv=ensembl$gv,
ddir=".",
dbCref=ensembl$pdbCref,
canChromosomes=ensembl$canChromosomes
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2612413 139.6 8265491 441.5 13667542 730.0
## Vcells 8727882 66.6 38115104 290.8 164014750 1251.4
End: 2024-01-14 12:29:09.788619
ensembl <- ensembl_Sscrofa
print(ensembl)
## $release
## [1] "111"
##
## $organism
## [1] "Sus scrofa"
##
## $gv
## [1] "111"
##
## $gdbCref
## EntrezGene Vega_gene Ens_Ss_gene
## "EntrezGene" "Vega_gene" "Ens_gene"
##
## $gdbAss
## miRBase UniGene
## "miRBase" "UniGene"
##
## $tdbCref
## RefSeq_mRNA RefSeq_ncRNA RefSeq_mRNA_predicted
## "RefSeq" "RefSeq" "RefSeq"
## RefSeq_ncRNA_predicted Vega_transcript Ens_Ss_transcript
## "RefSeq" "Vega_transcript" "Ens_transcript"
##
## $pdbCref
## RefSeq_peptide RefSeq_peptide_predicted Uniprot/SPTREMBL
## "RefSeq_peptide" "RefSeq_peptide" "Uniprot"
## Uniprot/SWISSPROT Vega_translation Ens_Ss_translation
## "Uniprot" "Vega_translation" "Ens_translation"
##
## $canChromosomes
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "X" "Y" "MT"
Start: 2024-01-14 12:29:09.796041
BED:::getEnsemblGeneIds(
organism=ensembl$organism,
release=ensembl$release,
gv=ensembl$gv,
ddir=".",
dbCref=ensembl$gdbCref,
dbAss=ensembl$gdbAss,
canChromosomes=ensembl$canChromosomes
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2612437 139.6 8265491 441.5 13667542 730.0
## Vcells 8728160 66.6 36654500 279.7 164014750 1251.4
End: 2024-01-14 12:30:00.176693
Start: 2024-01-14 12:30:00.177154
BED:::getEnsemblTranscriptIds(
organism=ensembl$organism,
release=ensembl$release,
gv=ensembl$gv,
ddir=".",
dbCref=ensembl$tdbCref,
canChromosomes=ensembl$canChromosomes
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2612363 139.6 8265491 441.5 13667542 730.0
## Vcells 8728057 66.6 35252320 269.0 164014750 1251.4
End: 2024-01-14 12:31:08.373983
Start: 2024-01-14 12:31:08.374404
BED:::getEnsemblPeptideIds(
organism=ensembl$organism,
release=ensembl$release,
gv=ensembl$gv,
ddir=".",
dbCref=ensembl$pdbCref,
canChromosomes=ensembl$canChromosomes
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2612371 139.6 8265491 441.5 13667542 730.0
## Vcells 8728133 66.6 33906228 258.7 164014750 1251.4
End: 2024-01-14 12:33:23.738484
Information is downloaded if older than 4 days
according to the reDumpThr
object.
BED:::dumpNcbiDb(
taxOfInt = c(), reDumpThr=reDumpThr,
ddir=".",
toLoad=c(), curDate=curDate
)
BED:::registerBEDB(
name="EntrezGene",
description="NCBI gene",
currentVersion=format(dumpDate, "%Y%m%d"),
idURL='https://www.ncbi.nlm.nih.gov/gene/%s'
)
BED:::registerBEDB(
name="RefSeq",
description="NCBI nucleotide",
currentVersion=format(dumpDate, "%Y%m%d"),
idURL='https://www.ncbi.nlm.nih.gov/nuccore/%s'
)
BED:::registerBEDB(
name="RefSeq_peptide",
description="NCBI protein",
currentVersion=format(dumpDate, "%Y%m%d"),
idURL='https://www.ncbi.nlm.nih.gov/protein/%s'
)
Start: 2024-01-14 12:33:23.952307
BED:::getNcbiGeneTransPep(
organism="Danio rerio",
ddir=".",
curDate=curDate
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2618928 139.9 8265491 441.5 13667542 730.0
## Vcells 8742202 66.7 33942068 259.0 164014750 1251.4
End: 2024-01-14 12:36:09.404121
Start: 2024-01-14 12:36:09.404555
BED:::getNcbiGeneTransPep(
organism="Homo sapiens",
ddir=".",
curDate=curDate
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2619002 139.9 13489241 720.5 16861551 900.6
## Vcells 8742375 66.7 94990661 724.8 164014750 1251.4
End: 2024-01-14 12:44:13.060746
Start: 2024-01-14 12:44:13.06117
BED:::getNcbiGeneTransPep(
organism="Mus musculus",
ddir=".",
curDate=curDate
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2618965 139.9 10791393 576.4 16861551 900.6
## Vcells 8742363 66.7 75992529 579.8 164014750 1251.4
End: 2024-01-14 12:49:08.567864
Start: 2024-01-14 12:49:08.568286
BED:::getNcbiGeneTransPep(
organism="Rattus norvegicus",
ddir=".",
curDate=curDate
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2618991 139.9 8633115 461.1 16861551 900.6
## Vcells 8742456 66.7 60794024 463.9 164014750 1251.4
End: 2024-01-14 12:52:37.196582
Start: 2024-01-14 12:52:37.197022
BED:::getNcbiGeneTransPep(
organism="Sus scrofa",
ddir=".",
curDate=curDate
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2618939 139.9 8633115 461.1 16861551 900.6
## Vcells 8742419 66.7 38908176 296.9 164014750 1251.4
End: 2024-01-14 12:54:38.571459
Start: 2024-01-14 12:54:38.571967
message("Direct cross-references with Uniprot")
BED:::dumpNcbiDb(
taxOfInt="",
reDumpThr=Inf,
ddir=".",
toLoad="gene_refseq_uniprotkb_collab",
curDate=Sys.Date()
)
gene_refseq_uniprotkb_collab <- gene_refseq_uniprotkb_collab[
gene_refseq_uniprotkb_collab$method %in% c("uniprot", "identical"),
]
gene_refseq_uniprotkb_collab$NCBI_protein_accession <- sub(
"[.].*$", "", gene_refseq_uniprotkb_collab$NCBI_protein_accession
)
for(org in listOrganisms()){
message(" ", org)
curRS <- getBeIds(
be="Peptide", source="RefSeq_peptide", organism=org,
restricted=TRUE
)
toAdd <- gene_refseq_uniprotkb_collab[
which(
gene_refseq_uniprotkb_collab$NCBI_protein_accession %in% curRS$id
),
]
## External DB IDs
toImport <- unique(toAdd[, "UniProtKB_protein_accession", drop=F])
colnames(toImport) <- "id"
BED:::loadBE(
d=toImport, be="Peptide",
dbname="Uniprot",
taxId=NA
)
## The cross references
toImport <- toAdd[
, c("NCBI_protein_accession", "UniProtKB_protein_accession")
]
colnames(toImport) <- c("id1", "id2")
BED:::loadCorrespondsTo(
d=toImport,
db1="RefSeq_peptide",
db2="Uniprot",
be="Peptide"
)
}
End: 2024-01-14 13:26:02.061278
Release is defined according to the reldate.txt file on the Uniprot FTP and data is downloaded only if not already done for the current release.
ftp <- "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions"
avRel <- readLines(file.path(ftp, "reldate.txt"), n=1)
avRel <- sub(
"^UniProt Knowledgebase Release ", "",
sub(" consists of:$", "", avRel)
)
if(is.na(as.Date(paste0(avRel, "_01"), format="%Y_%m_%d"))){
print(avRel)
stop(sprintf("Check reldate.txt file on %s", ftp))
}
BED:::registerBEDB(
name="Uniprot",
description="Uniprot",
currentVersion=avRel,
idURL='http://www.uniprot.org/uniprot/%s'
)
Start: 2024-01-14 13:26:05.317015
BED:::getUniprot(
organism="Danio rerio", release=avRel, ddir="."
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 226060423 12073.0 542148864 28953.9 542148864 28953.9
## Vcells 1255117686 9575.8 2413975841 18417.2 2413843464 18416.2
End: 2024-01-14 13:29:13.713659
Start: 2024-01-14 13:29:13.714174
BED:::getUniprot(
organism="Homo sapiens", release=avRel, ddir="."
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 226060350 12073.0 542148864 28953.9 542148864 28953.9
## Vcells 1255117615 9575.8 2413975841 18417.2 2413843464 18416.2
End: 2024-01-14 13:34:04.992427
Start: 2024-01-14 13:34:04.993228
BED:::getUniprot(
organism="Mus musculus", release=avRel, ddir="."
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 226060307 12073.0 542148864 28953.9 542148864 28953.9
## Vcells 1255117594 9575.8 2413975841 18417.2 2413843464 18416.2
End: 2024-01-14 13:36:43.992773
Start: 2024-01-14 13:36:43.993324
BED:::getUniprot(
organism="Rattus norvegicus", release=avRel, ddir="."
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 226060282 12073.0 542148864 28953.9 542148864 28953.9
## Vcells 1255117603 9575.8 2413975841 18417.2 2413843464 18416.2
End: 2024-01-14 13:39:14.783914
Start: 2024-01-14 13:39:14.784573
BED:::getUniprot(
organism="Sus scrofa", release=avRel, ddir="."
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 226060200 12073.0 542148864 28953.9 542148864 28953.9
## Vcells 1255117517 9575.8 2413975841 18417.2 2413843464 18416.2
End: 2024-01-14 13:53:43.462274
Start: 2024-01-14 13:53:43.462833
message("Indirect cross-references with Uniprot")
dumpDir <- "NCBI-gene-DATA"
f <- "gene2accession.gz"
if(file.exists(dumpDir)){
load(file.path(dumpDir, "dumpDate.rda"))
message("Last download: ", dumpDate)
if(curDate - dumpDate > reDumpThr | !file.exists(file.path(dumpDir, f))){
toDownload <- TRUE
}else{
toDownload <- FALSE
}
}else{
message("Not downloaded yet")
toDownload <- TRUE
}
if(toDownload){
ftp <- "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/"
dlok <- try(download.file(
url=paste0(ftp, f),
destfile=file.path(dumpDir, f),
method="wget",
quiet=T
), silent=T)
}else{
message("Existing data are going to be used")
}
cn <- readLines(file.path(dumpDir, f), n=1)
cn <- sub("^#", "", cn)
cn <- unlist(strsplit(cn, split="[ \t]"))
for(org in listOrganisms()){
message(" ", org)
tid <- getTaxId(org)
toAdd <- read.table(
text=system(
sprintf("zgrep ^%s %s", tid, file.path(dumpDir, f)),
intern=TRUE
),
sep="\t",
header=F,
stringsAsFactors=F,
quote="", comment.char=""
)
colnames(toAdd) <- cn
toAdd <- toAdd[
which(toAdd$tax_id==tid),
c("tax_id", "GeneID", "protein_accession.version")
]
toAdd$pacc <- sub("[.].*$", "", toAdd$protein_accession.version)
curUP <- getBeIdConvTable(
from="Gene", from.source="BEDTech_gene", organism=org,
to="Peptide", to.source="Uniprot",
restricted=TRUE
)
toAdd <- merge(
toAdd[,c("GeneID", "pacc")],
curUP[,c("from", "to")],
by.x="pacc", by.y="to",
all=FALSE
)
toAdd <- toAdd[,c("from", "GeneID")]
toAdd$from <- as.character(toAdd$from)
toAdd$GeneID <- as.character(toAdd$GeneID)
colnames(toAdd) <- c("id1", "id2")
BED:::loadIsAssociatedTo(
d=toAdd,
db1="BEDTech_gene", db2="EntrezGene",
be="Gene"
)
}
End: 2024-01-14 14:14:25.28842
Start: 2024-01-14 14:14:25.289829
.db_reconnect(.tkcon)
tkmb <- get_MDB(.tkcon, "MetaBase")
# mbInfo <- mbquery("select * from zzz_System")
BED:::registerBEDB(
name="MetaBase_gene",
description="Clarivate Analytics MetaBase",
currentVersion=tkmb$MetaBase_sourceDatabases$current,
idURL='https://portal.genego.com/cgi/entity_page.cgi?term=20&id=%s'
)
BED:::registerBEDB(
name="MetaBase_object",
description="Clarivate Analytics MetaBase",
currentVersion=tkmb$MetaBase_sourceDatabases$current,
idURL='https://portal.genego.com/cgi/entity_page.cgi?term=100&id=%s'
)
.db_reconnect(tkmb)
loadMBObjects_fromTKCat(
orgOfInt=c("Homo sapiens"),
tkmb=tkmb
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 226200939 12080.5 542148864 28953.9 542148864 28953.9
## Vcells 1282928352 9788.0 2413975841 18417.2 2413843464 18416.2
.db_reconnect(tkmb)
loadMBObjects_fromTKCat(
orgOfInt=c("Mus musculus"),
tkmb=tkmb
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 226193680 12080.1 542148864 28953.9 542148864 28953.9
## Vcells 1282386179 9783.9 2413975841 18417.2 2413843464 18416.2
.db_reconnect(tkmb)
loadMBObjects_fromTKCat(
orgOfInt=c("Rattus norvegicus"),
tkmb=tkmb
)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 226193681 12080.1 542148864 28953.9 542148864 28953.9
## Vcells 1282386219 9783.9 2413975841 18417.2 2413843464 18416.2
End: 2024-01-14 14:19:30.118291
Start: 2024-01-14 14:19:30.118892
library(biomaRt)
loadBmHomologs <- function(mart, org2){
#mattr <- listAttributes(mart)
toImport <- getBM(
mart=mart,
attributes=c(
"ensembl_gene_id",
paste0(
org2,
c("_homolog_ensembl_gene", "_homolog_orthology_confidence")
)
)
)
colnames(toImport) <- c("id1", "id2", "cs")
toImport <- unique(toImport[
which(toImport$id1 != "" & toImport$id2 != "" & toImport$cs==1),
c("id1", "id2")
])
BED:::loadIsHomologOf(
d=toImport,
db1="Ens_gene", db2="Ens_gene",
be="Gene"
)
}
#########################################
orgOfInt <- c("hsapiens", "mmusculus", "rnorvegicus", "sscrofa", "drerio")
marts <-listMarts()
bm <- "ENSEMBL_MART_ENSEMBL"
version <- ensembl_release
if(
grep(
sprintf(" %s$", version),
marts[which(marts$biomart==bm), "version"]
)==1
){
version <- NULL
}
for(i in 1:(length(orgOfInt)-1)){
#########################################
## The mart
org1 <- orgOfInt[i]
mart <- useEnsembl(
biomart=bm,
dataset=paste0(org1, "_gene_ensembl"),
version=version
)
for(j in (i+1):length(orgOfInt)){
loadBmHomologs(
mart=mart,
org2=orgOfInt[j]
)
}
}
End: 2024-01-14 14:23:56.647594
Start: 2024-01-14 14:23:56.648806
#####################################
gdbname <- "EntrezGene"
taxOfInt <- unlist(lapply(
c(
"Homo sapiens", "Mus musculus", "Rattus norvegicus",
"Sus scrofa", "Danio rerio"
),
getTaxId
))
for(i in 1:length(taxOfInt)){
BED:::dumpNcbiDb(
taxOfInt=taxOfInt[i],
reDumpThr=reDumpThr,
ddir=".",
toLoad=c("gene_orthologs"),
curDate=curDate
)
toImport <- gene_orthologs[
which(
gene_orthologs$tax_id %in% taxOfInt &
gene_orthologs$Other_tax_id %in% taxOfInt &
gene_orthologs$relationship == "Ortholog"
),
c("GeneID", "Other_GeneID")
]
if(nrow(toImport)>0){
colnames(toImport) <- c("id1", "id2")
toImport <- dplyr::mutate_all(toImport, as.character)
BED:::loadIsHomologOf(
d=toImport,
db1=gdbname, db2=gdbname,
be="Gene"
)
}
}
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 229414415 12252.1 542148864 28953.9 542148864 28953.9
## Vcells 1263917544 9643.0 2413975841 18417.2 2413843464 18416.2
End: 2024-01-14 14:26:10.311735
library(GEOquery)
dir.create("geo", showWarnings=FALSE)
Start: 2024-01-14 14:26:10.586352
## Import plateform
platName <- "GPL1708"
gds <- getGEO(platName, destdir="geo")
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping with Entrez
d <- Table(gds)
toImport <- d[which(!is.na(d$SPOT_ID)), c("SPOT_ID", "GENE")]
colnames(toImport) <- c("probeID", "id")
toImport$probeID <- as.character(toImport$probeID)
toImport$id <- as.character(toImport$id)
toImport <- toImport[which(!is.na(toImport$id)),]
toImport <- unique(toImport)
dbname <- "EntrezGene"
##
BED:::loadProbes(
d=toImport,
be=be,
platform=platName,
dbname=dbname
)
# ## Import mapping with UniGene
# toImport <- d[which(!is.na(d$SPOT_ID)), c("SPOT_ID", "UNIGENE_ID")]
# colnames(toImport) <- c("probeID", "id")
# toImport$probeID <- as.character(toImport$probeID)
# toImport$id <- as.character(toImport$id)
# toImport <- toImport[which(!is.na(toImport$id) & toImport$id!=""),]
# dbname <- "UniGene"
# ##
# BED:::loadProbes(
# d=toImport,
# be=be,
# platform=platName,
# dbname=dbname
# )
End: 2024-01-14 14:26:19.093677
Start: 2024-01-14 14:26:19.094476
## Import plateform
platName <- "GPL6480"
gds <- getGEO(platName, destdir="geo")
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping with Entrez
d <- Table(gds)
toImport <- d[which(!is.na(d$ID)), c("ID", "GENE")]
colnames(toImport) <- c("probeID", "id")
toImport$probeID <- as.character(toImport$probeID)
toImport$id <- as.character(toImport$id)
toImport <- toImport[which(!is.na(toImport$id)),]
dbname <- "EntrezGene"
##
BED:::loadProbes(
d=toImport,
be=be,
platform=platName,
dbname=dbname
)
# ## Import mapping with UniGene
# toImport <- d[which(!is.na(d$ID)), c("ID", "UNIGENE_ID")]
# colnames(toImport) <- c("probeID", "id")
# toImport$probeID <- as.character(toImport$probeID)
# toImport$id <- as.character(toImport$id)
# toImport <- toImport[which(!is.na(toImport$id)),]
# dbname <- "UniGene"
# ##
# BED:::loadProbes(
# d=toImport,
# be=be,
# platform=platName,
# dbname=dbname
# )
End: 2024-01-14 14:26:28.678242
Start: 2024-01-14 14:26:28.679068
## Import plateform
platName <- "GPL570"
gds <- getGEO(platName, destdir="geo")
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
d <- Table(gds)
toImport <- strsplit(
as.character(d$ENTREZ_GENE_ID),
split=" /// "
)
names(toImport) <- d$ID
toImport <- stack(toImport)
toImport[,1] <- as.character(toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
dbname <- "EntrezGene"
##
BED:::loadProbes(
d=toImport,
be=be,
platform=platName,
dbname=dbname
)
End: 2024-01-14 14:26:45.047439
Start: 2024-01-14 14:26:45.048921
## Import plateform
platName <- "GPL571"
gds <- getGEO(platName, destdir="geo")
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
d <- Table(gds)
toImport <- strsplit(
as.character(d$ENTREZ_GENE_ID),
split=" /// "
)
names(toImport) <- d$ID
toImport <- stack(toImport)
toImport[,1] <- as.character(toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
dbname <- "EntrezGene"
##
BED:::loadProbes(
d=toImport,
be=be,
platform=platName,
dbname=dbname
)
End: 2024-01-14 14:26:53.111189
Start: 2024-01-14 14:26:53.112198
## Import plateform
platName <- "GPL13158"
gds <- getGEO(platName, destdir="geo")
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
d <- Table(gds)
toImport <- strsplit(
as.character(d$ENTREZ_GENE_ID),
split=" /// "
)
names(toImport) <- d$ID
toImport <- stack(toImport)
toImport[,1] <- as.character(toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
dbname <- "EntrezGene"
##
BED:::loadProbes(
d=toImport,
be=be,
platform=platName,
dbname=dbname
)
End: 2024-01-14 14:27:06.255659
Start: 2024-01-14 14:27:06.256824
## Import plateform
platName <- "GPL96"
gds <- getGEO(platName, destdir="geo")
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
d <- Table(gds)
toImport <- strsplit(
as.character(d$ENTREZ_GENE_ID),
split=" /// "
)
names(toImport) <- d$ID
toImport <- stack(toImport)
toImport[,1] <- as.character(toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
dbname <- "EntrezGene"
##
BED:::loadProbes(
d=toImport,
be=be,
platform=platName,
dbname=dbname
)
End: 2024-01-14 14:27:15.260589
Start: 2024-01-14 14:27:15.26153
## Import plateform
platName <- "GPL1261"
gds <- getGEO(platName, destdir="geo")
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
d <- Table(gds)
toImport <- strsplit(
as.character(d$ENTREZ_GENE_ID),
split=" /// "
)
names(toImport) <- d$ID
toImport <- stack(toImport)
toImport[,1] <- as.character(toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
dbname <- "EntrezGene"
##
BED:::loadProbes(
d=toImport,
be=be,
platform=platName,
dbname=dbname
)
End: 2024-01-14 14:27:30.303245
Start: 2024-01-14 14:27:30.304478
## Import plateform
platName <- "GPL1355"
gds <- getGEO(platName, destdir="geo")
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
d <- Table(gds)
toImport <- strsplit(
as.character(d$ENTREZ_GENE_ID),
split=" /// "
)
names(toImport) <- d$ID
toImport <- stack(toImport)
toImport[,1] <- as.character(toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
dbname <- "EntrezGene"
##
BED:::loadProbes(
d=toImport,
be=be,
platform=platName,
dbname=dbname
)
End: 2024-01-14 14:27:42.796712
Start: 2024-01-14 14:27:42.79785
## Import plateform
platName <- "GPL10558"
gds <- getGEO(platName, destdir="geo")
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
d <- Table(gds)
toImport <- d[,c("Entrez_Gene_ID", "ID")]
toImport[,1] <- as.character(toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
toImport <- toImport[which(!is.na(toImport$id) & toImport$id != ""),]
dbname <- "EntrezGene"
##
BED:::loadProbes(
d=toImport,
be=be,
platform=platName,
dbname=dbname
)
End: 2024-01-14 14:28:00.285635
Start: 2024-01-14 14:28:00.286885
## Import plateform
platName <- "GPL6947"
gds <- getGEO(platName, destdir="geo")
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
d <- Table(gds)
toImport <- d[,c("Entrez_Gene_ID", "ID")]
toImport[,1] <- as.character(toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
toImport <- toImport[which(!is.na(toImport$id) & toImport$id != ""),]
dbname <- "EntrezGene"
##
BED:::loadProbes(
d=toImport,
be=be,
platform=platName,
dbname=dbname
)
End: 2024-01-14 14:28:12.228395
Start: 2024-01-14 14:28:12.229644
## Import plateform
platName <- "GPL6885"
gds <- getGEO(platName, destdir="geo")
platDesc <- Meta(gds)$title
d <- Table(gds)
# e <- getBeIds(
# be="Gene", source="EntrezGene", organism="mouse", restricted=FALSE
# )
# sum(d$Entrez_Gene_ID %in% e$id) < sum(sub("[.].*$", "", d$RefSeq_ID) %in% f$id)
be <- "Transcript"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
toImport <- d[,c("RefSeq_ID", "ID")]
toImport[,1] <- as.character(toImport[,1])
toImport[,1] <- sub("[.].*$", "", toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
toImport <- toImport[which(!is.na(toImport$id) & toImport$id != ""),]
dbname <- "RefSeq"
##
BED:::loadProbes(
d=toImport,
be=be,
platform=platName,
dbname=dbname
)
End: 2024-01-14 14:28:20.738121
Start: 2024-01-14 14:28:20.739532
## Import plateform
platName <- "GPL6887"
gds <- getGEO(platName, destdir="geo")
platDesc <- Meta(gds)$title
d <- Table(gds)
# e <- getBeIds(
# be="Gene", source="EntrezGene", organism="mouse", restricted=FALSE
# )
# sum(d$Entrez_Gene_ID %in% e$id) > sum(sub("[.].*$", "", d$RefSeq_ID) %in% f$id)
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
toImport <- d[,c("Entrez_Gene_ID", "ID")]
toImport[,1] <- as.character(toImport[,1])
# toImport[,1] <- sub("[.].*$", "", toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
toImport <- toImport[which(!is.na(toImport$id) & toImport$id != ""),]
dbname <- "EntrezGene"
##
BED:::loadProbes(
d=toImport,
be=be,
platform=platName,
dbname=dbname
)
End: 2024-01-14 14:28:33.174355
Start: 2024-01-14 14:28:33.175525
## Import plateform
platName <- "GPL6101"
gds <- getGEO(platName, destdir="geo")
platDesc <- Meta(gds)$title
be <- "Gene"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping
d <- Table(gds)
toImport <- d[,c("Entrez_Gene_ID", "ID")]
toImport[,1] <- as.character(toImport[,1])
toImport[,2] <- as.character(toImport[,2])
colnames(toImport) <- c("id", "probeID")
toImport <- toImport[which(!is.na(toImport$id) & toImport$id != ""),]
dbname <- "EntrezGene"
##
BED:::loadProbes(
d=toImport,
be=be,
platform=platName,
dbname=dbname
)
End: 2024-01-14 14:28:39.874118
Start: 2024-01-14 14:28:39.875085
library(biomaRt)
bm <- "ENSEMBL_MART_ENSEMBL"
marts <-listMarts()
version <- ensembl_release
if(grep(
sprintf(" %s$", version),
marts[which(marts$biomart==bm), "version"]
) == 1) {
version <- NULL
}
orgOfInt <- c("hsapiens", "mmusculus", "rnorvegicus", "sscrofa", "drerio")
for(org in orgOfInt){
message(org)
mart <- useEnsembl(
biomart = bm,
dataset = paste0(org, "_gene_ensembl"),
version = version
)
at <- listAttributes(mart) %>%
dplyr::filter(
stringr::str_detect(
description, stringr::regex("probe", ignore_case=TRUE)
)
) %>%
dplyr::filter(
!name %in%
c(
"affy_huex_1_0_st_v2",
"affy_moex_1_0_st_v1",
"affy_raex_1_0_st_v1"
)
)
for(i in 1:nrow(at)){
message(" ", i, "/", nrow(at), " platforms")
message(" ", Sys.time())
platName <- at$name[i]
platDesc <- paste(at$description[i], "(Ensembl BioMart mapping)")
be <- "Transcript"
##
BED:::loadPlf(name=platName, description=platDesc, be=be)
## Import mapping with Ens_transcript
toImport <- getBM(
mart=mart,
attributes=c(
"ensembl_transcript_id",
platName
)
) %>%
dplyr::as_tibble() %>%
magrittr::set_colnames(c("id", "probeID")) %>%
dplyr::filter(!is.na(probeID) & probeID!="" & !is.na(id) & id!="") %>%
dplyr::select(probeID, id) %>%
dplyr::distinct()
dbname <- "Ens_transcript"
##
BED:::loadProbes(
d=toImport,
be=be,
platform=platName,
dbname=dbname
)
}
}
End: 2024-01-14 15:22:02.818444
otherIdURL <- list(
"HGNC"='http://www.genenames.org/cgi-bin/gene_symbol_report?hgnc_id=%s',
"miRBase"='http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s',
"Vega_gene"='http://vega.sanger.ac.uk/id/%s',
"UniGene"='https://www.ncbi.nlm.nih.gov/unigene?term=%s',
"Vega_transcript"='http://vega.sanger.ac.uk/id/%s',
"MGI"='http://www.informatics.jax.org/marker/MGI:%s',
"Vega_translation"='http://vega.sanger.ac.uk/id/%s',
"RGD"='https://rgd.mcw.edu/rgdweb/report/gene/main.html?id=%s',
"MIM_GENE"='http://www.omim.org/entry/%s',
"ZFIN_gene"='http://zfin.org/%s'
)
for(db in names(otherIdURL)){
BED:::registerBEDB(
name=db,
idURL=otherIdURL[[db]]
)
}
Start: 2024-01-14 15:22:03.355202
BED:::loadLuceneIndexes()
End: 2024-01-14 15:22:03.590014
## R version 4.3.0 (2023-04-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.3.so; LAPACK version 3.8.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Brussels
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] GEOquery_2.70.0 Biobase_2.62.0 BiocGenerics_0.48.1
## [4] biomaRt_2.58.0 TKCat_1.1.3 DBI_1.2.0
## [7] ReDaMoR_0.7.2 magrittr_2.0.3 dplyr_1.1.4
## [10] BED_1.5.0 visNetwork_2.1.2 neo2R_2.4.2
## [13] knitr_1.45
##
## loaded via a namespace (and not attached):
## [1] rstudioapi_0.15.0 jsonlite_1.8.8 rmarkdown_2.25
## [4] zlibbioc_1.48.0 vctrs_0.6.5 memoise_2.0.1
## [7] RCurl_1.98-1.13 askpass_1.2.0 base64enc_0.1-3
## [10] htmltools_0.5.7 progress_1.2.3 curl_5.2.0
## [13] sass_0.4.8 parallelly_1.36.0 bslib_0.6.1
## [16] htmlwidgets_1.6.4 cachem_1.0.8 uuid_1.1-1
## [19] ClickHouseHTTP_0.3.2 mime_0.12 lifecycle_1.0.4
## [22] pkgconfig_2.0.3 colourpicker_1.3.0 Matrix_1.6-5
## [25] R6_2.5.1 fastmap_1.1.1 GenomeInfoDbData_1.2.11
## [28] future_1.33.1 shiny_1.8.0 digest_0.6.34
## [31] AzureAuth_1.3.3 AnnotationDbi_1.64.1 S4Vectors_0.40.2
## [34] RSQLite_2.3.4 filelock_1.0.3 fansi_1.0.6
## [37] httr_1.4.7 compiler_4.3.0 bit64_4.0.5
## [40] withr_2.5.2 R.utils_2.12.3 openssl_2.1.1
## [43] rappdirs_0.3.3 tools_4.3.0 httpuv_1.6.13
## [46] jsonvalidate_1.3.2 R.oo_1.25.0 glue_1.7.0
## [49] promises_1.2.1 grid_4.3.0 getPass_0.2-4
## [52] generics_0.1.3 tzdb_0.4.0 R.methodsS3_1.8.2
## [55] tidyr_1.3.0 data.table_1.14.10 hms_1.1.3
## [58] xml2_1.3.6 utf8_1.2.4 XVector_0.42.0
## [61] pillar_1.9.0 markdown_1.12 stringr_1.5.1
## [64] limma_3.58.1 later_1.3.2 rintrojs_0.3.4
## [67] BiocFileCache_2.10.1 lattice_0.22-5 bit_4.0.5
## [70] tidyselect_1.2.0 jose_1.2.0 Biostrings_2.70.1
## [73] miniUI_0.1.1.1 V8_4.4.1 IRanges_2.36.0
## [76] stats4_4.3.0 xfun_0.41 shinydashboard_0.7.2
## [79] statmod_1.5.0 DT_0.31 stringi_1.8.3
## [82] yaml_2.3.8 evaluate_0.23 codetools_0.2-19
## [85] tibble_3.2.1 cli_3.6.2 arrow_14.0.0.2
## [88] xtable_1.8-4 jquerylib_0.1.4 roxygen2_7.3.0
## [91] Rcpp_1.0.12 GenomeInfoDb_1.38.5 globals_0.16.2
## [94] dbplyr_2.4.0 png_0.1-8 XML_3.99-0.16
## [97] parallel_4.3.0 ellipsis_0.3.2 readr_2.1.5
## [100] assertthat_0.2.1 blob_1.2.4 prettyunits_1.2.0
## [103] bitops_1.0-7 listenv_0.9.0 purrr_1.0.2
## [106] crayon_1.5.2 rlang_1.1.3 KEGGREST_1.42.0
## [109] shinyjs_2.1.0
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.