## x... optional, the database file name.
## v... verbosity
## function to load the database...
EnsDb <- function( x ){
options( useFancyQuotes=FALSE )
if( missing( x ) ){
stop( "No sqlite file provided!" )
}
lite <- dbDriver("SQLite")
con <- dbConnect(lite, dbname = x, flags=SQLITE_RO )
tables <- dbListTables( con )
## read the columns for these tables.
Tables <- vector( length=length( tables ), "list" )
for( i in 1:length( Tables ) ){
Tables[[ i ]] <- colnames( dbGetQuery( con, paste0( "select * from ", tables[ i ], " limit 1" ) ) )
}
names( Tables ) <- tables
EDB <- new( "EnsDb", ensdb=con, tables=Tables )
return( EDB )
}
## x is the connection to the database, name is the name of the entry to fetch
.getMetaDataValue <- function( x, name ){
return( dbGetQuery( x, paste0( "select value from metadata where name='", name, "'" ) )[ 1, 1] )
}
####
## Note: that's the central function that checks which tables are needed for the
## least expensive join!!! The names of the tables should then also be submitted
## to any other method that calls prefixColumns (e.g. where of the Filter classes)
##
## this function checks:
## a) for multi-table columns, selects the table with the highest degree
## b) pre-pend (inverse of append ;) ) the table name to the column name.
## returns a list, names being the tables and the values being the columns
## named: <table name>.<column name>
## clean: whether a cleanColumns should be called on the submitted columns.
## with.tables: force the prefix to be specifically on the submitted tables.
prefixColumns <- function( x, columns, clean=TRUE, with.tables ){
if( missing( columns ) )
stop( "columns is empty! No columns provided!" )
## first get to the tables that contain these columns
Tab <- listTables( x ) ## returns the tables by degree!
if( !missing( with.tables ) ){
with.tables <- with.tables[ with.tables %in% names( Tab ) ]
if( length( with.tables ) > 0 ){
Tab <- Tab[ with.tables ]
}else{
warning( "The submitted table names are not valid in the database and were thus dropped." )
}
if( length( Tab )==0 )
stop( "None of the tables submitted with with.tables is present in the database!" )
}
if( clean )
columns <- cleanColumns( x, columns )
if( length( columns )==0 ){
return( NULL )
}
## group the columns by table.
columns.bytable <- sapply( Tab, function( z ){
return( z[ z %in% columns ] )
}, simplify=FALSE, USE.NAMES=TRUE )
## kick out empty tables...
columns.bytable <- columns.bytable[ unlist( lapply( columns.bytable, function( z ){
return( length( z ) > 0)
} ) ) ]
if( length( columns.bytable )==0 )
stop( "No columns available!" )
have.columns <- NULL
## new approach! order the tables by number of elements, and after that, re-order them.
columns.bytable <- columns.bytable[ order( unlist( lapply( columns.bytable, length ) ),
decreasing=TRUE ) ]
## has to be a for loop!!!
## loop throught the columns by table and sequentially kick out columns for the current table if they where already
## in a previous (more relevant) table
## however, prefer also cases were fewer tables are returned.
for( i in 1:length( columns.bytable ) ){
bm <- columns.bytable[[ i ]] %in% have.columns
keepvals <- columns.bytable[[ i ]][ !bm ] ## keep those
if( length( keepvals ) > 0 ){
have.columns <- c( have.columns, keepvals )
}
if( length( keepvals ) > 0 ){
columns.bytable[[ i ]] <- paste( names( columns.bytable )[ i ], keepvals, sep="." )
}else{
columns.bytable[[ i ]] <- keepvals
}
}
## kick out those tables with no elements left...
columns.bytable <- columns.bytable[ unlist( lapply( columns.bytable, function( z ){
return( length( z ) > 0)
} ) ) ]
## re-order by degree.
columns.bytable <- columns.bytable[ tablesByDegree( x, names( columns.bytable ) ) ]
return( columns.bytable )
}
## define a function to create a join query based on columns
## this function has to first get all tables that contain the columns,
## and then select, for columns present in more than one
## x... EnsDb
## columns... the columns
joinQueryOnColumns <- function( x, columns ){
columns.bytable <- prefixColumns( x, columns )
## based on that we can build the query based on the tables we've got. Note that the
## function internally
## adds tables that might be needed for the join.
Query <- joinQueryOnTables( x, names( columns.bytable ) )
return( Query )
}
## only list direct joins!!!
.JOINS <- rbind(
c( "gene", "tx", "join tx on (gene.gene_id=tx.gene_id)" ),
c( "gene", "chromosome", "join chromosome on (gene.seq_name=chromosome.seq_name)" ),
c( "tx", "tx2exon", "join tx2exon on (tx.tx_id=tx2exon.tx_id)" ),
c( "tx2exon", "exon", "join exon on (tx2exon.exon_id=exon.exon_id)" )
)
## tx is now no 1:
## .JOINS <- rbind(
## c( "tx", "gene", "join gene on (tx.gene_id=gene.gene_id)" ),
## c( "gene", "chromosome", "join chromosome on (gene.seq_name=chromosome.seq_name)" ),
## c( "tx", "tx2exon", "join tx2exon on (tx.tx_id=tx2exon.tx_id)" ),
## c( "tx2exon", "exon", "join exon on (tx2exon.exon_id=exon.exon_id)" )
## )
joinQueryOnTables <- function( x, tab ){
## just to be on the save side: evaluate whether we have all required tables to join;
## this will also ensure that the order is by degree.
tab <- addRequiredTables( x, tab )
Query <- tab[ 1 ]
previous.table <- tab[ 1 ]
for( i in 1:length( tab ) ){
if( i > 1 ){
Query <- paste( Query, .JOINS[ .JOINS[ , 2 ]==tab[ i ], 3 ] )
}
}
return( Query )
}
###
## Add additional tables in case the submitted tables are not directly connected
## and can thus not be joined. That's however not so complicated, since the database
## layout is pretty simple.
## The tables are:
##
## exon -(exon_id=t2e_exon_id)- tx2exon -(t2e_tx_id=tx_id)- tx -(gene_id=gene_id)- gene
## |
## chromosome -(seq_name=seq_name)-ยด
addRequiredTables <- function( x, tab ){
## dash it, as long as I can't find a way to get connected objects in a
## graph I'll do it manually...
## if we have exon and any other table, we need definitely tx2exon
if( any( tab=="exon" ) & length( tab ) > 1 ){
tab <- unique( c( tab, "tx2exon" ) )
}
## if we have chromosome and any other table, we'll need gene
if( any( tab=="chromosome" ) & length( tab ) > 1 ){
tab <- unique( c( tab, "gene" ) )
}
## if we have exon and we have gene, we'll need also tx
if( ( any( tab=="exon" ) | (any( tab=="tx2exon" )) ) & any( tab=="gene" ) ){
tab <- unique( c( tab, "tx" ) )
}
return( tablesByDegree( x, tab ) )
}
.buildQuery <- function(x, columns, filter=list(), order.by="", order.type="asc",
group.by, skip.order.check=FALSE, return.all.columns=TRUE){
resultcolumns <- columns ## just to remember what we really want to give back
## 1) get all column names from the filters also removing the prefix.
if(class(filter)!="list")
stop("parameter filter has to be a list of BasicFilter classes!")
if(length(filter) > 0){
## check filter!
## add the columns needed for the filter
filtercolumns <- unlist(lapply(filter, column, x))
## remove the prefix (column name for these)
filtercolumns <- sapply(filtercolumns, removePrefix, USE.NAMES=FALSE)
columns <- unique(c(columns, filtercolumns))
}
## 2) get all column names for the order.by:
if( order.by!=""){
## if we have skip.order.check set we use the order.by as is.
if(!skip.order.check){
order.by <- unlist(strsplit(order.by, split=",", fixed=TRUE))
order.by <- gsub(order.by, pattern=" ", replacement="", fixed=TRUE)
## allow only order.by that are also in the columns.
order.by.nocolumns <- order.by[ !(order.by %in% columns) ]
order.by <- order.by[ order.by %in% columns ]
if(length(order.by.nocolumns) > 0){
warning("columns provided in order.by (",
paste(order.by.nocolumns, collapse=","),
") are not in columns and were thus removed." )
}
if(length(order.by)==0){
order.by <- ""
}
}
}else{
order.by <- ""
}
## Note: order by is now a vector!!!
## columns are now all columns that we want to fetch or that we need to
## filter or to sort.
## 3) check which tables we need for all of these columns:
need.tables <- names( prefixColumns( x, columns ) )
##
## Now we can begin to build the query parts!
## a) the query part that joins all required tables.
joinquery <- joinQueryOnColumns(x, columns=columns)
## b) the filter part of the query
if( length( filter ) > 0 ){
filterquery <- paste(" where",
paste(unlist(lapply(filter, where, x,
with.tables=need.tables)),
collapse=" and "))
}else{
filterquery <- ""
}
## c) the order part of the query
if( order.by!="" ){
if( !skip.order.check ){
order.by <- paste(unlist(prefixColumns(x=x, columns=order.by,
with.tables=need.tables),
use.names=FALSE), collapse=",")
}
orderquery <- paste( " order by", order.by, order.type )
}else{
orderquery <- ""
}
## And finally build the final query
if(return.all.columns){
resultcolumns <- columns
}
finalquery <- paste0("select distinct ",
paste(unlist(prefixColumns(x,
resultcolumns,
with.tables=need.tables),
use.names=FALSE), collapse=","),
" from ",
joinquery,
filterquery,
orderquery
)
return( finalquery )
}
## the buildQuery function; basically create the SQL query.
## x... EnsDb
## columns... the columns to retrieve.
## filter... list of filters
.buildQueryOld <- function( x, columns, filter=list(), order.by="", order.type="asc", group.by, skip.order.check=FALSE ){
resultcolumns <- columns ## just to remember what we really want to give back
if( !missing( filter ) ){
if( class( filter )!="list" )
stop( "parameter filter has to be a list of BasicFilter classes!" )
}
## first checking the filters and eventually add required columns to the columns:
if( length( filter ) > 0 ){
## check filter!
## add the columns needed for the filter
filtercolumns <- unlist( lapply( filter, column, x ) ) ## this returns
## the full named columns!
filtercolumns <- sapply( filtercolumns, removePrefix, USE.NAMES=FALSE )
columns <- unique( c( columns, filtercolumns ) )
## next we're building the where query.
wherequery <- paste( " where", paste( unlist( lapply( filter, where, x ) ),
collapse=" and " ) )
}else{
wherequery <- ""
}
## should we do an order.by?
if( !missing( order.by ) & order.by!="" ){
## if we have skip.order.check set we use the order.by as is.
if( !skip.order.check ){
order.by <- unlist( strsplit( order.by, split=",", fixed=TRUE ) )
order.by <- gsub( order.by, pattern=" ", replacement="", fixed=TRUE )
## allow only order.by that are also in the columns.
order.by.nocolumns <- order.by[ !( order.by %in% columns ) ]
order.by <- order.by[ order.by %in% columns ]
if( length( order.by.nocolumns ) > 0 ){
warning( "columns provided in order.by (",
paste( order.by.nocolumns, collapse=","),
") are not in columns and were thus removed." )
}
if( length( order.by )==0 ){
order.by <- ""
}else{
## have to pre-pend the database table name...
order.by <- paste( unlist( prefixColumns( x=x, columns=order.by ), use.names=FALSE ), collapse="," )
}
}
}else{
order.by <- ""
}
## OK, build that query.
if( order.by!="" ){
orderquery <- paste( " order by", order.by, order.type )
}else{
orderquery <- ""
}
## now build the join query that joins all required tables.
joinquery <- joinQueryOnColumns( x, columns=columns )
finalquery <- paste0( "select distinct ",
paste( unlist( prefixColumns( x, resultcolumns ),
use.names=FALSE ), collapse="," ),
" from ",
joinquery,
wherequery,
orderquery
)
return( finalquery )
}
## remove the prefix again...
removePrefix <- function( x, split=".", fixed=TRUE ){
return( sapply( x, function( z ){
tmp <- unlist( strsplit( z, split=split, fixed=fixed ) )
return( tmp[ length( tmp ) ] )
} ) )
}
## just to add another layer; basically just calls buildQuery and executes the query
.getWhat <- function( x, columns, filter=list(), order.by="",
order.type="asc", group.by, skip.order.check=FALSE ){
## build the query
Q <- .buildQuery( x=x, columns=columns, filter=filter,
order.by=order.by, order.type=order.type, group.by=group.by,
skip.order.check=skip.order.check )
## get the data
Res <- dbGetQuery( dbconn( x ), Q )
if( any( columns=="tx_cds_seq_start" ) ){
suppressWarnings(
## column contains "NULL" if not defined and coordinates are characters
## as.numeric transforms "NULL" into NA, and ensures coords are numeric.
Res[ , "tx_cds_seq_start" ] <- as.numeric( Res[ , "tx_cds_seq_start" ] )
)
}
if( any( columns=="tx_cds_seq_end" ) ){
suppressWarnings(
## column contains "NULL" if not defined and coordinates are characters
## as.numeric transforms "NULL" into NA, and ensures coords are numeric.
Res[ , "tx_cds_seq_end" ] <- as.numeric( Res[ , "tx_cds_seq_end" ] )
)
}
return( Res )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.