R/dbhelpers.R

Defines functions .logOp2SQL listEnsDbs .createEnsDbIndices .mysql_datatype feedEnsDb2MySQL dbHasValidTables dbHasRequiredTables dbSchemaVersion .ensdb_protein_tables .ensdb_tables .getWhat removePrefix .buildQuery addRequiredTables joinQueryOnColumns2 joinQueryOnTables2 joinTwoTables prefixColumnsKeepOrder prefixColumns .getMetaDataValue EnsDb

Documented in EnsDb listEnsDbs

############################################################
## EnsDb
## Constructor function.
#' @title Connect to an EnsDb object
#'
#' @description The \code{EnsDb} constructor function connects to the database
#'     specified with argument \code{x} and returns a corresponding
#'     \code{\linkS4class{EnsDb}} object.
#'
#' @details
#'
#' By providing the connection to a MariaDB/MySQL database, it is possible
#' to use MariaDB/MySQL as the database backend and queries will be performed on
#' that database. Note however that this requires the package \code{RMariaDB}
#' to be installed. In addition, the user needs to have access to a MySQL
#' server providing already an EnsDb database, or must have write
#' privileges on a MySQL server, in which case the \code{\link{useMySQL}}
#' method can be used to insert the annotations from an EnsDB package into
#' a MySQL database.
#'
#' @param x Either a character specifying the \emph{SQLite} database file, or
#'     a \code{DBIConnection} to e.g. a MariaDB/MySQL database.
#'
#' @return A \code{\linkS4class{EnsDb}} object.
#'
#' @author Johannes Rainer
#'
#' @examples
#' ## "Standard" way to create an EnsDb object:
#' library(EnsDb.Hsapiens.v86)
#' EnsDb.Hsapiens.v86
#'
#' ## Alternatively, provide the full file name of a SQLite database file
#' dbfile <- system.file("extdata/EnsDb.Hsapiens.v86.sqlite", package = "EnsDb.Hsapiens.v86")
#' edb <- EnsDb(dbfile)
#' edb
#'
#' ## Third way: connect to a MySQL database
#' \dontrun{
#' library(RMariaDB)
#' dbcon <- dbConnect(MySQL(), user = my_user, pass = my_pass,
#'     host = my_host, dbname = "ensdb_hsapiens_v86")
#' edb <- EnsDb(dbcon)
#' }
EnsDb <- function(x){
    options(useFancyQuotes=FALSE)
    if(missing(x)){
        stop("No sqlite file provided!")
    }
    if (is.character(x)) {
        lite <- dbDriver("SQLite")
        con <- dbConnect(lite, dbname = x, flags=SQLITE_RO)
    }
    else if (is(x, "DBIConnection")) {
        con <- x
    } else {
        stop("'x' should be either a character specifying the SQLite file to",
             " be loaded, or a DBIConnection object providing the connection",
             " to the database.")
    }
    ## Check if the database is valid.
    OK <- dbHasRequiredTables(con)
    if (is.character(OK))
        stop(OK)
    OK <- dbHasValidTables(con)
    if (is.character(OK))
        stop(OK)
    tables <- dbListTables(con)
    ## read the columns for these tables.
    Tables <- vector(length=length(tables), "list")
    for (i in 1:length(Tables)) {
        Tables[[i]] <- colnames(dbGetQuery(con,
                                           paste0("select * from ",
                                                  tables[ i ], " limit 1")))
    }
    names(Tables) <- tables
    EDB <- new("EnsDb", ensdb = con, tables = Tables)
    EDB <- setProperty(EDB, dbSeqlevelsStyle="Ensembl")
    ## Add the db schema version to the properties.
    EDB <- setProperty(EDB, DBSCHEMAVERSION =
                                .getMetaDataValue(con, "DBSCHEMAVERSION"))
    ## Setting the default for the returnFilterColumns
    returnFilterColumns(EDB) <- TRUE
    ## Defining the default for the ordering
    orderResultsInR(EDB) <- FALSE
    ## Check it again...
    OK <- validateEnsDb(EDB)
    if (is.character(OK))
        stop(OK)
    EDB
}

## x is the connection to the database, name is the name of the entry to fetch
.getMetaDataValue <- function(x, name){
    return(dbGetQuery(x, paste0("select value from metadata where name='",
                                name, "'"))[ 1, 1])
}

############################################################
## prefixColumns
##
## Determines which tables (along with the table attributes) are required for
## the join.
## Updated version of prefixColumns:
## o Uses the order of the tables returned by listTables and adds the first
##   table in which the column was found. That's different to the previous
##   default of trying to join as few tables as possible but avoids problems
##   with table joins between e.g. tx and protein in which not all tx_id are
##   present in the protein table.
prefixColumns <- function(x, columns, clean = TRUE, with.tables){
    if (missing(columns))
        stop("columns is empty! No columns provided!")
    ## first get to the tables that contain these columns
    Tab <- listTables(x)   ## returns the tables by degree!
    if (!missing(with.tables)) {
        with.tables <- with.tables[ with.tables %in% names(Tab) ]
        if (length(with.tables) > 0) {
            Tab <- Tab[ with.tables ]
        } else {
            warning("The submitted table names are not valid in the database",
                    " and were thus dropped.")
        }
        if (length(Tab) == 0)
            stop("None of the tables submitted with with.tables is present",
                 " in the database!")
    }
    if (clean)
        columns <- cleanColumns(x, columns)
    if (length(columns) == 0) {
        return(NULL)
    }
    getCols <- columns
    result <- lapply(Tab, function(z) {
        if (length(getCols) > 0) {
            gotIt <- z[z %in% getCols]
            if (length(gotIt) > 0) {
                getCols <<- getCols[!(getCols %in% gotIt)]
                return(gotIt)
            } else {
                return(character())
            }
        }
    })
    ## If getCols length > 0 it contains columns not present in the db.
    result <- result[lengths(result) > 0]
    if (length(result) == 0)
        stop("None of the columns could be found in the database!")
    result <- mapply(result, names(result), FUN = function(z, y) {
        paste0(y, ".", z)
    }, SIMPLIFY = FALSE)
    return(result)
}

############################################################
## call the prefixColumns function and return just the column
## names, but in the same order than the provided columns.
prefixColumnsKeepOrder <- function(x, columns, clean = TRUE, with.tables) {
    res <- unlist(prefixColumns(x, columns, clean, with.tables),
                  use.names = FALSE)
    res_order <- sapply(columns, function(z) {
        idx <- grep(res, pattern = paste0("\\.", z, "$"))
        if (length(idx) == 0)
            return(NULL)
        return(res[idx[1]])
    })
    return(res_order[!is.null(res_order)])
}

############################################################
## ** NEW JOIN ENGINE **
##
## 1: table 1
## 2: table 2
## 3: on
## 4: suggested join
.JOINS2 <- rbind(
    c("gene", "tx", "on (gene.gene_id=tx.gene_id)", "join"),
    c("gene", "chromosome", "on (gene.seq_name=chromosome.seq_name)", "join"),
    c("tx", "tx2exon", "on (tx.tx_id=tx2exon.tx_id)", "join"),
    c("tx2exon", "exon", "on (tx2exon.exon_id=exon.exon_id)", "join"),
    c("tx", "protein", "on (tx.tx_id=protein.tx_id)", "left outer join"),
    c("gene", "entrezgene", "on (gene.gene_id=entrezgene.gene_id)",
      "left outer join"),
    c("protein", "protein_domain",
      "on (protein.protein_id=protein_domain.protein_id)", "left outer join"),
    c("protein", "uniprot", "on (protein.protein_id=uniprot.protein_id)",
      "left outer join"),
    c("uniprot", "protein_domain",
      "on (uniprot.protein_id=protein_domain.protein_id)", "left outer join")
)
## Takes the names of two tables, determines how to join them and returns the
## join query row, if found.
joinTwoTables <- function(a, b, mysql = FALSE) {
    jns <- .JOINS2
    gotIt <- which((jns[, 1] %in% a & jns[, 2] %in% b) |
                   (jns[, 2] %in% a & jns[, 1] %in% b))
    if (length(gotIt))
        jns[gotIt[1], ]
    else
        stop("Table(s) ", paste(a, collapse = ", "), " can not be joined with ",
             paste(b, collapse = ", "), "!")
}
## x: EnsDb.
## tab: tables to join.
## join: which type of join should be used?
## startWith: optional table name from which the join should start. That's
## specifically important for a left outer join call.
joinQueryOnTables2 <- function(x, tab, join = "suggested", startWith = NULL) {
    ## join can be join, left join, left outer join or suggested in which case
    ## the join defined in the .JOINS2 table will be used.
    join <- match.arg(join, c("join", "left join", "left outer join",
                              "suggested"))
    ## Order the tables.
    ## Start with startWith, or with the first one.
    if (missing(tab))
        stop("Argument 'tab' missing! Need some tables to make a join!")
    if (!is.null(startWith)) {
        if (!any(tab == startWith))
            stop("If provided, 'startWith' has to be the name of one of the",
                 " tables that should be joined!")
    }
    ## Add eventually needed tables to link the ones provided. The tables will
    ## be ordered by degree.
    tab <- addRequiredTables(x, tab)
    if (!is.null(startWith)) {
        alreadyUsed <- startWith
        tab <- tab[tab != startWith]
    } else {
        alreadyUsed <- tab[1]
        tab <- tab[-1]
    }
    Query <- alreadyUsed
    mysql <- is(dbconn(x), "MariaDBConnection")
    ## Iteratively build the query.
    while (length(tab) > 0) {
        res <- joinTwoTables(a = alreadyUsed, b = tab, mysql = mysql)
        newTab <- res[1:2][!(res[1:2] %in% alreadyUsed)]
        ## Could also use the suggested join which is in element 4.
        Query <- paste(Query, ifelse(join == "suggested", res[4], join),
                       newTab, res[3])
        alreadyUsed <- c(alreadyUsed, newTab)
        tab <- tab[tab != newTab]
    }
    return(Query)
}
## this function has to first get all tables that contain the columns,
## and then select, for columns present in more than one
## x... EnsDb
## columns... the columns
## NOTE: if "startWith" is not NULL, we're adding it to the tables!!!!
joinQueryOnColumns2 <- function(x, columns, join = "suggested",
                                startWith = NULL) {
    columns.bytable <- prefixColumns(x, columns)
    ## based on that we can build the query based on the tables we've got.
    ## Note that the function internally
    ## adds tables that might be needed for the join.
    Query <- joinQueryOnTables2(x, tab = c(names(columns.bytable), startWith),
                                join = join,
                                startWith = startWith)
    return(Query)
}


###
## Add additional tables in case the submitted tables are not directly connected
## and can thus not be joined. That's however not so complicated, since the database
## layout is pretty simple.
addRequiredTables <- function(x, tab){
    ## dash it, as long as I can't find a way to get connected objects in a
    ## graph I'll do it manually...
    ## if we have exon and any other table, we need definitely tx2exon
    if(any(tab=="exon") & length(tab) > 1){
        tab <- unique(c(tab, "tx2exon"))
    }
    ## if we have chromosome and any other table, we'll need gene
    if(any(tab=="chromosome") & length(tab) > 1){
        tab <- unique(c(tab, "gene"))
    }
    ## if we have exon and we have gene, we'll need also tx
    if((any(tab=="exon") | (any(tab=="tx2exon"))) & any(tab=="gene")){
        tab <- unique(c(tab, "tx"))
    }
    if (hasProteinData(x)) {
        ## Resolve the proteins: need tx to map between proteome and genome
        if (any(tab %in% c("uniprot", "protein_domain", "protein")) &
            any(tab %in% c("exon", "tx2exon", "gene",
                           "chromosome", "entrezgene")))
            tab <- unique(c(tab, "tx"))
        ## Need protein.
        if (any(tab %in% c("uniprot", "protein_domain")) &
            any(tab %in% c("exon", "tx2exon", "tx", "gene", "chromosome",
                           "entrezgene")))
            tab <- unique(c(tab, "protein"))
    }
    ## entrezgene is only linked via gene
    if (any(tab == "entrezgene") & length(tab) > 1)
        tab <- unique(c(tab, "gene"))
    return(tablesByDegree(x, tab))
}


############################################################
## .buildQuery
##
## The "backbone" function that builds the SQL query based on the specified
## columns, the provided filters etc.
## x an EnsDb object
## startWith: optional table from which the join should start.
.buildQuery <- function(x, columns, filter = AnnotationFilterList(),
                        order.by = "", order.type = "asc", group.by,
                        skip.order.check=FALSE, return.all.columns = TRUE,
                        join = "suggested", startWith = NULL) {
    resultcolumns <- columns    ## just to remember what we really want to give back
    ## 1) get all column names from the filters also removing the prefix.
    if (!is(filter, "AnnotationFilterList"))
        stop("parameter 'filter' has to be an 'AnnotationFilterList'!")
    if (length(filter) > 0) {
        ## check filter!
        ## add the columns needed for the filter
        filtercolumns <- unlist(lapply(filter, ensDbColumn, x))
        ## remove the prefix (column name for these)
        filtercolumns <- removePrefix(filtercolumns)
        columns <- unique(c(columns, filtercolumns))
    }
    ## 2) get all column names for the order.by:
    if (any(order.by != "")) {
        ## if we have skip.order.check set we use the order.by as is.
        if (!skip.order.check) {
            order.by <- checkOrderBy(orderBy = order.by, supported = columns)
        }
    } else {
        order.by <- ""
    }
    ## Note: order by is now a vector!!!
    ## columns are now all columns that we want to fetch or that we need to
    ## filter or to sort.
    ## 3) check which tables we need for all of these columns:
    need.tables <- names(prefixColumns(x, columns))
    ##
    ## Now we can begin to build the query parts!
    ## a) the query part that joins all required tables.
    joinquery <- joinQueryOnColumns2(x, columns=columns, join = join,
                                     startWith = startWith)
    ## b) the filter part of the query
    if (length(filter) > 0) {
        filterquery <- paste0(" where ", ensDbQuery(filter, x,
                                                    with.tables = need.tables))
    } else {
        filterquery <- ""
    }
    ## c) the order part of the query
    if (any(order.by != "")) {
        if (!skip.order.check) {
            order.by <- paste(prefixColumnsKeepOrder(x = x, columns = order.by,
                                                     with.tables = need.tables),
                              collapse=",")
        }
        orderquery <- paste(" order by", order.by, order.type)
    } else {
        orderquery <- ""
    }
    ## And finally build the final query
    if (return.all.columns) {
        resultcolumns <- columns
    }
    paste0("select distinct ",
           paste(prefixColumnsKeepOrder(x, resultcolumns,
                                        with.tables = need.tables),
                 collapse=","),
           " from ",
           joinquery,
           filterquery,
           orderquery
           )
}

## remove the prefix again...
removePrefix <- function(x, split=".", fixed = TRUE) {
    vapply(x, function(z) {
        tmp <- unlist(strsplit(z, split=split, fixed=fixed))
        tmp[length(tmp)]
    }, character(1), USE.NAMES = FALSE)
}

## just to add another layer; basically just calls buildQuery and executes the
## query
## join: what type of join should be performed.
## startWith: the name of the table from which the query should be started.
.getWhat <- function(x, columns, filter = AnnotationFilterList(), order.by = "",
                     order.type = "asc", group.by = NULL,
                     skip.order.check = FALSE, join = "suggested",
                     startWith = NULL) {
    ## That's nasty stuff; for now we support the column tx_name, which we however
    ## don't have in the database. Thus, we are querying everything except that
    ## column and filling it later with the values from tx_id.
    fetchColumns <- columns
    if(any(columns == "tx_name"))
        fetchColumns <- unique(
            c("tx_id", fetchColumns[fetchColumns != "tx_name"]))
    if (!is(filter, "AnnotationFilterList"))
        stop("parameter 'filter' has to be an 'AnnotationFilterList'!")
    ## Add also the global filter if present.
    global_filter <- .activeFilter(x)
    if (is(global_filter, "AnnotationFilter") |
        is(global_filter, "AnnotationFilterList"))
        filter <- AnnotationFilterList(global_filter, filter)
    ## If any filter is a SymbolFilter, add "symbol" to the return columns.
    if (length(filter) > 0) {
        if (any(.anyIs(filter, "SymbolFilter")))
            columns <- unique(c(columns, "symbol"))  ## append a filter column.
    }
    ## Catch also a "symbol" in columns
    if(any(columns == "symbol"))
        fetchColumns <- unique(c(fetchColumns[fetchColumns != "symbol"],
                                 "gene_name"))
    ## Shall we do the ordering in R or in SQL?
    if (orderResultsInR(x) & !skip.order.check) {
        ## Build the query
        Q <- .buildQuery(x = x, columns = fetchColumns, filter = filter,
                         order.by = "", order.type = order.type,
                         group.by = group.by,
                         skip.order.check = skip.order.check, join = join,
                         startWith = startWith)
        ## Get the data
        ## message("Query: ", Q, "\n")
        Res <- dbGetQuery(dbconn(x), Q)
        ## Note: we can only order by the columns that we did get back from the
        ## database; that might be different for the SQL sorting!
        Res <- orderDataFrameBy(Res, by = checkOrderBy(order.by, fetchColumns),
                                decreasing = order.type != "asc")
    } else {
        ## Build the query
        Q <- .buildQuery(x = x, columns = fetchColumns, filter = filter,
                         order.by = order.by, order.type = order.type,
                         group.by = group.by,
                         skip.order.check = skip.order.check, join = join,
                         startWith = startWith)
        ## Get the data
        ## message("Query: ", Q, "\n")
        Res <- dbGetQuery(dbconn(x), Q)
    }
    if(any(columns == "tx_cds_seq_start")) {
        if (!is.integer(Res[, "tx_cds_seq_start"])) {
            suppressWarnings(
                ## column contains "NULL" if not defined and coordinates are
                ## characters as.numeric transforms "NULL" into NA, and ensures
                ## coords are numeric.
                Res[ , "tx_cds_seq_start"] <- as.integer(Res[ , "tx_cds_seq_start"])
            )
        }
    }
    if(any(columns=="tx_cds_seq_end")){
        if (!is.integer(Res[, "tx_cds_seq_end"])) {
            suppressWarnings(
                ## column contains "NULL" if not defined and coordinates are
                ## characters as.numeric transforms "NULL" into NA, and ensures
                ## coords are numeric.
                Res[ , "tx_cds_seq_end" ] <- as.integer(Res[ , "tx_cds_seq_end"])
            )
        }
    }
    ## Fix for MySQL returning 'numeric' instead of 'integer'.
    ## THIS SHOULD BE REMOVED ONCE THE PROBLEM IS FIXED IN RMySQL!!!
    int_cols <- c("exon_seq_start", "exon_seq_end", "exon_idx", "tx_seq_start",
                  "tx_seq_end", "tx_cds_seq_start", "tx_cds_seq_end",
                  "gene_seq_start", "gene_seq_end", "seq_strand")
    for (the_col in int_cols) {
        if (any(colnames(Res) == the_col))
            if (!is.integer(Res[, the_col]))
                Res[, the_col] <- as.integer(Res[, the_col])
    }
    ## Resolving the "symlinks" again.
    if (any(columns == "tx_name"))
        Res <- data.frame(Res, tx_name = Res$tx_id, stringsAsFactors = FALSE)
    if(any(columns == "symbol"))
        Res <- data.frame(Res, symbol = Res$gene_name, stringsAsFactors = FALSE)
    ## Ensure that the ordering is as requested.
    Res <- Res[, columns, drop=FALSE]
    Res
}

############################################################
## Check database validity.
#' @description Return tables with attributes based on the provided schema.
#'
#' @noRd
.ensdb_tables <- function(version = "1.0") {
    .ENSDB_TABLES <- list(`1.0` = list(
                              gene = c("gene_id", "gene_name", "entrezid",
                                       "gene_biotype", "gene_seq_start",
                                       "gene_seq_end", "seq_name", "seq_strand",
                                       "seq_coord_system"),
                              tx = c("tx_id", "tx_biotype", "tx_seq_start",
                                     "tx_seq_end", "tx_cds_seq_start",
                                     "tx_cds_seq_end", "gene_id"),
                              tx2exon = c("tx_id", "exon_id", "exon_idx"),
                              exon = c("exon_id", "exon_seq_start",
                                       "exon_seq_end"),
                              chromosome = c("seq_name", "seq_length",
                                             "is_circular"),
                              metadata = c("name", "value")),
                          `2.0` = list(
                              gene = c("gene_id", "gene_name",
                                       "gene_biotype", "gene_seq_start",
                                       "gene_seq_end", "seq_name", "seq_strand",
                                       "seq_coord_system"),
                              tx = c("tx_id", "tx_biotype", "tx_seq_start",
                                     "tx_seq_end", "tx_cds_seq_start",
                                     "tx_cds_seq_end", "gene_id"),
                              tx2exon = c("tx_id", "exon_id", "exon_idx"),
                              exon = c("exon_id", "exon_seq_start",
                                       "exon_seq_end"),
                              chromosome = c("seq_name", "seq_length",
                                             "is_circular"),
                              entrezgene = c("gene_id", "entrezid"),
                              metadata = c("name", "value")),
                          `2.2` = list(
                              gene = c("gene_id", "gene_name",
                                       "gene_biotype", "gene_seq_start",
                                       "gene_seq_end", "seq_name", "seq_strand",
                                       "seq_coord_system", "description",
                                       "gene_id_version",
                                       "canonical_transcript"),
                              tx = c("tx_id", "tx_biotype", "tx_seq_start",
                                     "tx_seq_end", "tx_cds_seq_start",
                                     "tx_cds_seq_end", "tx_external_name",
                                     "gene_id"),
                              tx2exon = c("tx_id", "exon_id", "exon_idx"),
                              exon = c("exon_id", "exon_seq_start",
                                       "exon_seq_end"),
                              chromosome = c("seq_name", "seq_length",
                                             "is_circular"),
                              entrezgene = c("gene_id", "entrezid"),
                              metadata = c("name", "value"))
                          )
    .ENSDB_TABLES[[version]]
}
.ensdb_protein_tables <- function(version = "1.0") {
    .ENSDB_PROTEIN_TABLES <- list(`1.0` = list(
                                      protein = c("tx_id", "protein_id",
                                                  "protein_sequence"),
                                      uniprot = c("protein_id", "uniprot_id",
                                                  "uniprot_db",
                                                  "uniprot_mapping_type"),
                                      protein_domain = c("protein_id",
                                                         "protein_domain_id",
                                                         "protein_domain_source",
                                                         "interpro_accession",
                                                         "prot_dom_start",
                                                         "prot_dom_end")),
                                  `2.0` = list(
                                      protein = c("tx_id", "protein_id",
                                                  "protein_sequence"),
                                      uniprot = c("protein_id", "uniprot_id",
                                                  "uniprot_db",
                                                  "uniprot_mapping_type"),
                                      protein_domain = c("protein_id",
                                                         "protein_domain_id",
                                                         "protein_domain_source",
                                                         "interpro_accession",
                                                         "prot_dom_start",
                                                         "prot_dom_end")),
                                  `2.1` = list(
                                      protein = c("tx_id", "protein_id",
                                                  "protein_sequence"),
                                      uniprot = c("protein_id", "uniprot_id",
                                                  "uniprot_db",
                                                  "uniprot_mapping_type"),
                                      protein_domain = c("protein_id",
                                                         "protein_domain_id",
                                                         "protein_domain_source",
                                                         "interpro_accession",
                                                         "prot_dom_start",
                                                         "prot_dom_end"))
                                  )
    .ENSDB_PROTEIN_TABLES[[version]]
}

#' @description Extract the database schema version if available in the metadata
#'     database column.
#'
#' @param x Can be either a connection object or an \code{EnsDb} object.
#'
#' @noRd
dbSchemaVersion <- function(x) {
    if (is(x, "EnsDb")) {
        return(getProperty(x, "DBSCHEMAVERSION"))
    } else {
        tabs <- dbListTables(x)
        if (any(tabs == "metadata")) {
            res <- dbGetQuery(x, "select * from metadata")
            if (any(res$name == "DBSCHEMAVERSION") &
                any(colnames(res) == "value"))
                return(res[res$name == "DBSCHEMAVERSION", "value"])
        }
    }
    return("1.0")
}

dbHasRequiredTables <- function(con, returnError = TRUE,
                                tables = .ensdb_tables(dbSchemaVersion(con))) {
    tabs <- dbListTables(con)
    if (length(tabs) == 0) {
        if (returnError)
            return("Database does not have any tables!")
        return(FALSE)
    }
    not_there <- names(tables)[!(names(tables) %in% tabs)]
    if (length(not_there) > 0) {
        if (returnError)
            return(paste0("Required tables ", paste(not_there, collapse = ", "),
                          " are not present in the database!"))
        return(FALSE)
    }
    return(TRUE)
}
dbHasValidTables <- function(con, returnError = TRUE,
                             tables = .ensdb_tables(dbSchemaVersion(con))) {
    for (tab in names(tables)) {
        cols <- tables[[tab]]
        from_db <- colnames(dbGetQuery(con, paste0("select * from ", tab,
                                                   " limit 1")))
        not_there <- cols[!(cols %in% from_db)]
        if (length(not_there) > 0) {
            if (returnError)
                return(paste0("Table ", tab, " is missing required columns ",
                              paste(not_there, collapse = ", "), "!"))
            return(FALSE)
        }
    }
    return(TRUE)
}

############################################################
## feedEnsDb2MySQL
##
##
feedEnsDb2MySQL <- function(x, mysql, verbose = TRUE) {
    if (!inherits(mysql, "MariaDBConnection"))
        stop("'mysql' is supposed to be a connection to a MySQL database.")
    ## Fetch the tables and feed them to MySQL.
    sqlite_con <- dbconn(x)
    tabs <- dbListTables(sqlite_con)
    for (the_table in tabs) {
        if (verbose)
            message("Fetch table ", the_table, " ... ", appendLF = FALSE)
        tmp <- dbGetQuery(sqlite_con, paste0("select * from ", the_table, ";"))
        if (verbose)
            message("OK\nStoring the table in MySQL ... ", appendLF = FALSE)
        ## Fix tx_cds_seq_start being a character in old databases
        if (any(colnames(tmp) == "tx_cds_seq_start")) {
            suppressWarnings(
                tmp[, "tx_cds_seq_start"] <- as.integer(tmp[, "tx_cds_seq_start"])
            )
            suppressWarnings(
                tmp[, "tx_cds_seq_end"] <- as.integer(tmp[, "tx_cds_seq_end"])
            )
        }
        dbWriteTable(mysql, tmp, name = the_table, row.names = FALSE,
                     field.types = vapply(tmp, .mysql_datatype, character(1)))
        if (verbose)
            message("OK")
    }
    ## Create the indices.
    if (verbose)
        message("Creating indices...", appendLF = FALSE)
    .createEnsDbIndices(mysql, proteins = hasProteinData(x))
    if (verbose)
        message("OK")
    return(TRUE)
}

#' Instead of using the default dbDataType function we define here a more
#' fine-grained data type definition.
#'
#' @noRd
.mysql_datatype <- function(x) {
    res <- "TEXT"
    if (any(is.na(x)) & any(!is.na(x)))
        x <- x[!is.na(x)]
    if (is.character(x)) {
        nc <- max(nchar(x))
        if (nc < 65535)
            res <- paste0("VARCHAR(", nc, ")")
        else res <- "MEDIUMTEXT"
    }
    if (is.logical(x))
        res <- "TINYINT"
    if (is.numeric(x)) {
        if (is.integer(x)) {
            res <- "INT"
            if (any(x < -2147483648 | x > 2147483647))
                res <- "BIGINT"
            if (all(x > -8388608 & x < 8388607))
                res <- "MEDIUMINT"
            if (all(x > -32768 & x < 32767))
                res <- "SMALLINT"
            if (all(x > -128 & x < 127))
                res <- "TINYINT"
        } else res <- "DOUBLE"
    }
    res
}

#' Small helper function to create all the indices.
#'
#' @param con database connection.
#'
#' @param mysql `logical(1)` indicating whether indices specific for
#'     MariaDB/MySQL databases should be created.
#'
#' @param proteins `logical(1)` whether indices for protein tables should be
#'     created too.
#'
#' @noRd
.createEnsDbIndices <- function(con, proteins = FALSE) {
    indexCols <- c(chromosome = "seq_name", gene = "gene_id", gene = "gene_name",
                   gene = "seq_name", tx = "tx_id", tx = "gene_id",
                   exon = "exon_id", tx2exon = "tx_id", tx2exon = "exon_id")
    idxType <- c(chromosome = "unique", gene = "unique", gene = "", gene = "",
                 tx = "unique", tx = "", exon = "unique", tx2exon = "",
                 tx2exon = "")
    tbls <- dbListTables(con)
    if (any(tbls == "entrezgene")) {
        indexCols <- c(indexCols,
                       entrezgene = "gene_id", entrezgene = "entrezid")
        idxType <- c(idxType, entrezgene = "", entrezgene = "")
    }
    if (proteins) {
        indexCols <- c(indexCols, protein = "tx_id", protein = "protein_id",
                       uniprot = "protein_id", uniprot = "uniprot_id",
                       protein_domain = "protein_domain_id",
                       protein_domain = "protein_id")
        idxType <- c(idxType, protein = "", protein = "", uniprot = "",
                     uniprot = "", protein_domain = "", protein_domain = "")
    }
    for (i in 1:length(indexCols)) {
        tabname <- names(indexCols)[i]
        colname <- indexCols[i]
        idx_tp <- idxType[i]
        tmp <- dbExecute(con, paste0("create ", idx_tp, " index ",
                                     tabname, "_", colname, "_idx on ",
                                     tabname, " (", colname, ")"))
    }
    ## Add the one on the numeric index:
    aff_rows <- dbExecute(con, paste0("create index tx2exon_exon_idx_idx on ",
                                      "tx2exon (exon_idx);"))
}

############################################################
## listEnsDbs
## list databases
#' @title List EnsDb databases in a MariaDB/MySQL server
#'
#' @description
#'
#' The \code{listEnsDbs} function lists EnsDb databases in a
#' MariaDB/MySQL server.
#'
#' @details
#'
#' The use of this function requires the \code{RMariaDB} package
#' to be installed. In addition user credentials to access a MySQL server
#' (with already installed EnsDb databases), or with write access are required.
#' For the latter EnsDb databases can be added with the \code{\link{useMySQL}}
#' method. EnsDb databases in a MariaDB/MySQL server follow the same naming
#' conventions than EnsDb packages, with the exception that the name is all
#' lower case and that each \code{"."} is replaced by \code{"_"}.
#'
#' @param dbcon A \code{DBIConnection} object providing access to a
#'     MariaDB/MySQL database. Either \code{dbcon} or all of the other
#'     arguments have to be specified.
#'
#' @param host Character specifying the host on which the MySQL server is
#'     running.
#'
#' @param port The port of the MariaDB/MySQL server (usually \code{3306}).
#'
#' @param user The username for the MariaDB/MySQL server.
#'
#' @param pass The password for the MariaDB/MySQL server.
#'
#' @return A \code{data.frame} listing the database names, organism name
#'     and Ensembl version of the EnsDb databases found on the server.
#'
#' @author Johannes Rainer
#'
#' @seealso \code{\link{useMySQL}}
#'
#' @examples
#' \dontrun{
#' library(RMariaDB)
#' dbcon <- dbConnect(MariaDB(), host = "localhost", user = my_user, pass = my_pass)
#' listEnsDbs(dbcon)
#' }
listEnsDbs <- function(dbcon, host, port, user, pass) {
    if(requireNamespace("RMariaDB", quietly = TRUE)) {
        if (missing(dbcon)) {
            if (missing(host) | missing(user) | missing(port) | missing(host))
                stop("Arguments 'host', 'port', 'user' and 'pass' are required",
                     " if 'dbcon' is not specified.")
            dbcon <- dbConnect(RMariaDB::MariaDB(), host = host, port = port,
                               user = user, pass = pass)
        }
        dbs <- dbGetQuery(dbcon, "show databases;")
        edbs <- dbs[grep(dbs$Database, pattern = "^ensdb_"), "Database"]
        edbTable <- data.frame(matrix(ncol = 3, nrow = length(edbs)))
        colnames (edbTable) <- c("dbname", "organism", "ensembl_version")
        for (i in seq_along(edbs)) {
            edbTable[i, "dbname"] <- edbs[i]
            tmp <- unlist(strsplit(edbs[i], split = "_"))
            edbTable[i, "organism"] <- tmp[2]
            edbTable[i, "ensembl_version"] <- as.numeric(gsub(pattern = "v",
                                                              replacement = "",
                                                              tmp[3]))
        }
        return(edbTable)
    } else {
        stop("Required package 'RMariaDB' is not installed.")
    }
}

#' Simple helper that "translates" R logical operators to SQL.
#' @noRd
.logOp2SQL <- function(x) {
    if (x == "|")
        return("or")
    if (x == "&")
        return("and")
    return(NULL)
}
jorainer/ensembldb documentation built on Aug. 23, 2024, 1:16 p.m.