## Functions related to create EnsDbs by downloading and installing MySQL
## databases from Ensembl.
library(RCurl)
library(RMariaDB)
library(ensembldb)
options(timeout = 3600)
#' @description Get core database names from the specified folder.
#'
#' @param ftp_folder The ftp url to the per-species mysql folders.
#'
#' @author Johannes Rainer
#'
#' @noRd
listCoreDbsInFolder <- function(ftp_folder) {
if (missing(ftp_folder))
stop("Argument 'ftp_folder' missing!")
folders <- unlist(strsplit(getURL(ftp_folder,
dirlistonly = TRUE), split = "\n"))
folders <- folders[grep("_core_", folders)]
res <- lapply(folders, function(z) {
tmp <- unlist(strsplit(z, split = "_core_"))
c(folder = z, organism = tmp[1L], type = "core", version = tmp[2])
})
do.call(rbind, res)
}
#' @description Creates an EnsDb for the specified species by first downloading
#' the corresponding MySQL database from Ensembl, installing it and
#' subsequently creating the EnsDb database from it.
#'
#' @param ftp_folder The ftp url to the per-species mysql folders. If not
#' provided it will use the default Ensembl ftp:
#' \code{ftp://ftp.ensembl.org/pub/release-<ens_version>/mysql/}.
#'
#' @param ens_version The Ensembl version (version of the Ensembl Perl API).
#'
#' @param species The name of the species (e.g. "homo_sapiens").
#'
#' @param user The user name for the MySQL/MariaDB database (write access).
#'
#' @param host The host on which the MySQL/MariaDB database is running.
#'
#' @param pass The password for the MySQL/MariaDB database.
#'
#' @param port The port of the MySQL/MariaDB database.
#'
#' @param local_tmp Local directory that will be used to temporarily store the
#' downloaded MySQL/MariaDB database files.
#'
#' @param dropDb Whether the Ensembl core database should be deleted once the
#' EnsDb has been created.
#'
#' @author Johannes Rainer
#'
#' @examples
#'
#' ## For Ensemblgenomes:
#' ftp_folder <- "ftp://ftp.ensemblgenomes.org/pub/release-33/fungi/mysql/"
#' @noRd
createEnsDbForSpecies <- function(ftp_folder,
ens_version = 86, species, user, host, pass,
port = NULL, local_tmp = tempdir(),
sub_dir = "",
dropDb = TRUE) {
## if ftp_folder is missing use the default one:
base_url = "ftp://ftp.ensembl.org/pub"
## (1) Get all directories from Ensembl
if (missing(ftp_folder))
ftp_folder <- paste0(base_url, "/release-", ens_version, "/mysql/")
res <- listCoreDbsInFolder(ftp_folder)
if (nrow(res) == 0)
stop("No directories found!")
if (missing(species))
species <- res[, "organism"]
rownames(res) <- res[, "organism"]
## Check if we've got the species available
got_specs <- species %in% rownames(res)
if (!all(got_specs))
warning("No core database for species ",
paste0(species[!got_specs], collapse = ", "), " found.")
species <- species[got_specs]
res <- res[species, , drop = FALSE]
if (length(species) == 0)
stop("No database for any provided species found!")
## (2) Process each species
message("Going to process ", nrow(res), " species.")
for (i in 1:nrow(res)) {
message("Processing species: ", res[i, "organism"], " (", i, " of ",
nrow(res), ")")
processOneSpecies(ftp_folder = paste0(ftp_folder, res[i, "folder"]),
ens_version = ens_version,
species = species[i], user = user, host = host,
pass = pass, port = port, local_tmp = local_tmp,
dropDb = dropDb)
message("Done with species: ", res[i, "organism"], ", ",
nrow(res) - i, " left.")
}
}
#' @description This function performs the actual tasks of downloading the
#' database files, installing them, deleting the download, creating the
#' EnsDb and deleting the database.
#'
#' @details While the location of the downloaded temporary MySQL database file
#' can be specified, the final SQLite file as well as all intermediate files
#' will be placed in the current working directory.
#'
#' @param ftp_folder The folder on Ensembl's ftp server containing the mysql
#' database files. Has to be the full path to these files.
#'
#' @param ens_version The Ensembl version (version of the Ensembl Perl API).
#'
#' @param species The name of the species (e.g. "homo_sapiens").
#'
#' @param user The user name for the MySQL/MariaDB database (write access).
#'
#' @param host The host on which the MySQL/MariaDB database is running.
#'
#' @param pass The password for the MySQL/MariaDB database.
#'
#' @param port The port of the MySQL/MariaDB database.
#'
#' @param local_tmp Local directory that will be used to temporarily store the
#' downloaded MySQL/MariaDB database files.
#'
#' @param dropDb Whether the Ensembl core database should be deleted once the
#' EnsDb has been created.
#'
#' @author Johannes Rainer
#'
#' @noRd
processOneSpecies <- function(ftp_folder, ens_version = 86, species, user,
host = "localhost",
pass, port = NULL, local_tmp = tempdir(),
dropDb = TRUE) {
if (missing(ftp_folder))
stop("'ftp_folder' has to be specified!")
if (missing(user))
stop("'user' has to be specified!")
if (missing(species))
stop("'species' has to be specified!")
## (1) Download database files.
res <- downloadFilesFromFtpFolder(url = ftp_folder, dest = local_tmp)
## (2) Install database.
db_name <- basename(ftp_folder)
res <- installEnsemblDb(dir = local_tmp, host = host, dbname = db_name,
user = user, pass = pass, port = port)
## (3) Delete the downloads.
fls <- dir(local_tmp, full.names = TRUE)
res <- sapply(fls, unlink)
## (4) Create the EnsDb (requires the correct Ensembl API)
## They are created in the local directory.
fetchTablesFromEnsembl(version = ens_version, species = species,
user = user, host = host, pass = pass, port = port)
DBFile <- makeEnsemblSQLiteFromTables()
unlink("*.txt")
## (5) Delete the database.
if (dropDb) {
if (length(port))
con <- dbConnect(MariaDB(), host = host, user = user, pass = pass,
port = port, dbname = "mysql")
else
con <- dbConnect(MariaDB(), host = host, user = user, pass = pass,
dbname = "mysql")
dbSendStatement(con, paste("drop database ", db_name))
dbDisconnect(con)
}
}
#'
#' @description Download all files from an ftp directory to a local directory.
#'
#' @param url A character string specifying the url of the directory.
#'
#' @param dest A character string specifying the local directory.
#'
#' @return A character string with the path of the local directory.
#'
#' @author Johannes Rainer
#'
#' @noRd
#'
#' @examples
#'
#' ftp_dir <- "ftp://ftp.ensembl.org/pub/release-88/mysql/homo_sapiens_core_88_38"
#' local_dir <- downloadFilesFromFtpFolder(ftp_dir)
downloadFilesFromFtpFolder <- function(url, dest = tempdir()) {
fls <- getURL(paste0(url, "/"), dirlistonly = TRUE)
fls <- unlist(strsplit(fls, split = "\n"))
message("Downloading ", length(fls), " files ... ", appendLF = FALSE)
for (i in 1:length(fls)) {
download.file(url = paste0(url, "/", fls[i]),
destfile = paste0(dest, "/", fls[i]), quiet = TRUE)
}
message("OK")
return(dest)
}
#' @description Install an Ensembl MySQL database downloaded from the Ensembl
#' ftp server (e.g. using \link{downloadFilesFromFtpFolder}).
#'
#' @note The local directory is expected to correspond to the name of the
#' database, i.e. \code{basename(dir)} will be used as the database name if
#' argument \code{dbname} is missing.
#'
#' @param dir The path to the local directory where the database files are.
#'
#' @param host The host running the MySQL database.
#'
#' @param dbname The name of the database. If not provided the name of the
#' provided directory will be used instead.
#'
#' @param user The user name for the MySQL database (rw access).
#'
#' @param pass The password for the MySQL database.
#'
#' @param port The port of the MySQL database.
#'
#' @author Johannes Rainer
#'
#' @noRd
#'
#' @examples
#' user <- "user"
#' pass <- "pass"
#' dbname <- "homo_sapiens_core_88_38"
#' ## set to directory returned by the downloadFilesFromFtpFolder
#' dir <- local_dir
#'
#' installEnsemblDb(dir = dir, dbname = dbname, user = user, pass = pass)
installEnsemblDb <- function(dir, host = "localhost", dbname, user, pass,
port = NULL) {
if (missing(dir))
stop("Argument 'dir' missing!")
if (missing(dbname))
dbname <- basename(dir)
if (missing(user))
stop("Argument 'user' missing!")
## Eventually unzip the files...
tmp <- system(paste0("gunzip ", dir, "/*.gz"))
## Create the database
if (length(port))
con <- dbConnect(MariaDB(), host = host, user = user, pass = pass,
port = port, dbname = "mysql")
else
con <- dbConnect(MariaDB(), host = host, user = user, pass = pass,
dbname = "mysql")
dbSendStatement(con, paste0("create database ", dbname))
dbDisconnect(con)
## Now create the tables and stuff.
if (length(port))
port <- paste0(" -P ", port)
tmp <- system(paste0("mysql -h ", host, " -u ", user, " --password=", pass,
port, " ", dbname, " < ", dir, "/", dbname,
".sql"))
## Importing the data.
cmd <- paste0("mysqlimport -h ", host, " -u ", user,
" --password=", pass, port,
" ", dbname, " -L ", dir, "/*.txt")
tmp <- system(cmd)
}
#' @description Creates EnsDb packages from all sqlite database files found in
#' the directory specified with parameter \code{dir}.
#' @param dir The path to the directory where the SQLite files can be found.
#' @param author The author of the package.
#' @param maintainer The maintainer of the package.
#' @param version The version of the package.
#' @noRd
createPackagesFromSQLite <- function(dir = ".", author, maintainer, version) {
if (missing(author) | missing(maintainer) | missing(version))
stop("Parameter 'author', 'maintainer' and 'version' are required!")
edbs <- dir(dir, full.names = TRUE, pattern = ".sqlite")
if (length(edbs) == 0)
stop("Found no SQLite database files in the specified directory!")
message("Processing ", length(edbs), " packages.")
for (i in 1:length(edbs)) {
message("Processing ", basename(edbs[i]), " (", i, " of ",
length(edbs), ")", appendLF = FALSE)
makeEnsembldbPackage(ensdb = edbs[i], version = version,
author = author, maintainer = maintainer)
message("OK")
}
}
## ftpf <- paste0("ftp://ftp.ensembl.org/pub/release-86/mysql/",
## "anas_platyrhynchos_core_86_1")
## local_dir <- tempdir()
## downloadFilesFromFtpFolder(ftpf, dest = local_dir)
## installEnsemblDb(dir = local_dir, host = "localhost", user = "jo",
## pass = "jo123", dbname = "anas_platyrhynchos_core_86_1")
## fls <- dir(local_dir, full.names = TRUE)
## res <- sapply(fls, unlink)
## fetchTablesFromEnsembl(86, species = "anas_platyrhynchos", user = "jo",
## host = "localhost", pass = "jo123", port = 3306)
## DBFile <- makeEnsemblSQLiteFromTables()
## unlink("*.txt")
## system.time(fetchTablesFromEnsembl(86, species = "anas_platyrhynchos"))
## ftpf <- paste0("ftp://ftp.ensembl.org/pub/release-86/mysql/",
## "homo_sapiens_core_86_38")
## local_dir <- tempdir()
## processOneSpecies(ftp_folder = ftpf, version = 86,
## species = "homo_sapiens", user = "jo",
## host = "localhost",
## pass = "jo123", port = 3306, local_tmp = local_dir,
## dropDb = FALSE)
## Add an issue:
## + Fix problem of non-defined sequence type "chromosome" in anas platyrhynchos
## database. -> update to the perl script.
## + Compare Hsapiens EnsDb created with new script and the "original" one.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.