# This script post-processes the data retrieved from Ensembl and imports it
# into the R package. Run it after the scripts that produce `species.csv`,
# `chromosomes.csv` and `genes.csv` (e.g. `ensembl_species.R`).
library(data.table)
# Load the three input tables. The paths are relative, so the CSV files are
# looked up in the current working directory.
species <- fread(file = "species.csv")
chromosomes <- fread(file = "chromosomes.csv")
genes <- fread(file = "genes.csv")
# Per-species chromosome summary: the number of chromosome rows and the median
# of their `length` column. The median is coerced to double so that the
# resulting column has one type for all groups (`median()` of integer input
# may return either integer or double).
species_metadata <- chromosomes[
  ,
  list(
    n_chromosomes = .N,
    median_chromosome_length = as.double(stats::median(length))
  ),
  by = "species"
]
# Attach the chromosome summary to the species table. This is an inner join on
# `species$id` == `species_metadata$species`; `sort = FALSE` keeps the result
# from being reordered by the join key.
species <- merge(
  x = species,
  y = species_metadata,
  by.x = "id",
  by.y = "species",
  sort = FALSE
)
# Keep only the first row for every (species, gene) combination.
genes <- unique(genes, by = c("species", "gene"))
# Combine every gene with the row of the chromosome it is located on. The
# chromosome is identified by the pair (species, chromosome ID); chromosome
# columns such as `name` and `length` become available next to the gene
# columns.
genes_chromosomes <- merge(
  x = genes,
  y = chromosomes,
  by.x = c("species", "chromosome"),
  by.y = c("species", "id"),
  sort = FALSE
)
# Distance of each gene to the nearest end of its chromosome: the smaller of
# the distance from the chromosome start (`start_position`) and the distance
# to the chromosome end (`length - end_position`). `pmin()` computes the
# elementwise minimum and yields `NA` whenever one of the operands is `NA`.
genes_chromosomes[, distance := pmin(start_position, length - end_position)]
# Extract the columns that make up the exported `distances` dataset.
distances <- genes_chromosomes[
  ,
  c(
    "species",
    "gene",
    "chromosome",
    "start_position",
    "end_position",
    "distance"
  ),
  with = FALSE
]
# Build the table of human genes: restrict the joined table to species 9606
# and keep the gene ID together with the name of its chromosome.
genes <- genes_chromosomes[species == 9606][
  ,
  list(id = gene, chromosome = name)
]
# Add a `name` column holding the `target` values that g:Profiler's gconvert
# returns for the gene IDs when converting to the "HGNC" namespace.
# NOTE(review): this relies on gconvert returning exactly one row per input
# ID, in input order, for `mthreshold = 1` and `filter_na = FALSE` — confirm
# against the gprofiler2 documentation.
data.table::set(
  genes,
  j = "name",
  value = gprofiler2::gconvert(
    genes[["id"]],
    target = "HGNC",
    mthreshold = 1,
    filter_na = FALSE
  )$target
)
# Species IDs are stored as character for backwards compatibility, because
# previous versions of geposan used different species IDs. Convert the ID
# columns of both exported tables in place.
data.table::set(
  species,
  j = "id",
  value = as.character(species[["id"]])
)
data.table::set(
  distances,
  j = "species",
  value = as.character(distances[["species"]])
)
# Store the three tables as package datasets, replacing any existing files.
usethis::use_data(species, genes, distances, overwrite = TRUE)
# NOTE: the two lines below are web-page boilerplate, not part of this script:
#   Add the following code to your website.
#   For more information on customizing the embed code, read Embedding Snippets.