# This script retrieves genome data from the Ensembl database. Run
# `ensembl_species.R` first and keep its output files "species.csv" and
# "chromosomes.csv".
library(data.table)
library(DBI)
library(glue)
compara_table <- "ensembl_compara_110"
# This is the output table of this script:
genes <- data.table(
species = integer(),
gene = character(),
chromosome = integer(),
start_position = integer(),
end_position = integer()
)
species <- fread("species.csv")
chromosomes <- fread("chromosomes.csv")
rlog::log_info("Connecting to Ensembl database server")
db <- dbConnect(
RMariaDB::MariaDB(),
host = "ensembldb.ensembl.org",
port = 5306,
user = "anonymous"
)
rlog::log_info("Retrieving human genes")
human_species_id <- 9606
human_present_row_count <- genes[species == human_species_id, .N]
if (human_present_row_count > 0) {
rlog::log_info(glue("Skipping. Present rows: {human_present_row_count}"))
} else {
human_table <- species[id == human_species_id, table_name]
dbExecute(db, glue_sql("USE {`human_table`}", .con = db))
human_chromosome_ids <- chromosomes[species == human_species_id, id]
human_genes <- db |>
dbGetQuery(glue_sql("
SELECT stable_id, seq_region_id, seq_region_start, seq_region_end
FROM gene WHERE seq_region_id IN ({human_chromosome_ids*})")) |>
as.data.table() |>
setnames(
c("stable_id", "seq_region_id", "seq_region_start", "seq_region_end"),
c("gene", "chromosome", "start_position", "end_position")
)
human_genes[, species := human_species_id]
genes <- rbind(genes, human_genes)
}
dbExecute(db, glue_sql("USE {`compara_table`}", .con = db))
for (species_id in species[id != human_species_id, id]) {
present_row_count <- genes[species == species_id, .N]
species_name <- species[id == species_id, name]
if (present_row_count > 0) {
rlog::log_info(glue("Skipping species {species_id} ({species_name})"))
rlog::log_info(glue("Present rows: {present_row_count}"))
next
}
rlog::log_info(glue(
"Retrieving genes for species {species_id} ({species_name})"
))
table_name <- species[id == species_id, table_name]
chromosome_ids <- chromosomes[species == species_id, id]
species_genes <- db |>
dbGetQuery(glue_sql("
SELECT
human.stable_id AS gene,
species.seq_region_id AS chromosome,
species.seq_region_start AS start_position,
species.seq_region_end AS end_position
FROM
(
SELECT
homology_id,
stable_id,
seq_region_id,
seq_region_start,
seq_region_end
FROM {`table_name`}.gene
JOIN gene_member USING (stable_id)
JOIN homology_member USING (gene_member_id)
JOIN homology USING (homology_id)
WHERE seq_region_id IN ({chromosome_ids*})
AND homology.description IN (
'ortholog_one2one',
'ortholog_one2many',
'ortholog_many2many'
)
) AS species
JOIN (
SELECT
homology_id,
stable_id
FROM homology_member
JOIN gene_member USING (gene_member_id)
WHERE taxon_id = {human_species_id}
) AS human ON species.homology_id = human.homology_id;
", .con = db)) |>
as.data.table()
if (nrow(species_genes) == 0) {
rlog::log_info("No human homologs found")
}
species_genes[, species := species_id]
genes <- rbind(genes, species_genes)
fwrite(genes, "genes.csv")
}
dbDisconnect(db)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.