select: Retrieve records from a dbframe
In grayclhn/dbframe-R-library: An R to SQL interface

Description Usage Arguments Source code Unit Tests References Examples

select is a wrapper for the sql select query and is used to retrieve records from a dbframe object. generate.select.sql assembles a valid sql select statement from its arguments. The arguments map to clauses in the sql select statement, so you may need to consult an introduction to sql to best use these functions.

select(x, cols, as.data.frame = TRUE,...)

generate.select.sql(table, cols = "*", where = NULL, group.by = NULL, 
                    having = NULL, order.by = NULL, limit = NULL,...)

`x`	A dbframe object that references the table of interest.
`table`	A character object containing the name of the sql table to query.
`cols`	A character vector containing the column names (or functions of the column names) to retrieve from the database.
`where`	A character object that contains conditions to use to filter the records.
`group.by`	A character vector that defines groups of records to combine with an aggregate function.
`having`	A character object that filters the groups defined by “group.by”.
`order.by`	A character vector that lists the columns to be used for sorting the results.
`limit`	A character vector or number that limits and offsets the sql query results.
`...`	Additional arguments to pass to dbGetQuery.
`as.data.frame`	Logical; if TRUE, execute the query and return the results as a data frame. If FALSE, return a dbframe that has the sql statement for the query as its “table”

The inidividual methods do some minor parsing, but most of the work is done by generate.select.sql.

<<*>>=
    generate.select.sql <- function(table, cols = "*", where = NULL, 
      group.by = NULL, having = NULL, order.by = NULL, limit = NULL, ...) {

      <<Format the "select" part of the statement>>
      <<Format the "group by" part of the statement>>
      <<Format the "order by" part of the statement>>
      <<Format the "having" part of the statement>>
      <<Format the "where" part of the statement>>
      <<Format the "limit" part of the statement>>
      return(paste("select", cols, "from", table, where, 
                   group.by, having, order.by, limit))
    }

    setGeneric("select", function(x, cols, as.data.frame = TRUE,...)
               standardGeneric("select"))

    setMethod("select", signature = c("ANY", "missing"), 
              function(x, cols, as.data.frame = TRUE,...) {
                <<Execute select for c("ANY", "missing")>>})

    setMethod("select", signature = c("dbframe", "character"), 
              function(x, cols, as.data.frame = TRUE,...) {
                <<Execute select for c("dbframe", "character")>>})

    setMethod("select", signature = c("data.frame", "character"),
              function(x, cols, as.data.frame = TRUE,...) {
                <<Handle arguments and set up the local environment>>
                <<Export the new data frame to a temporary SQLite database>>
                <<Query the new database and close database connection>>
                return(queryresults)
              })

    setMethod("select", signature = c("list", "character"), 
              function(x, cols,...) {
                <<Detect inappropriate uses of the "list" method>>
                <<Define and attach to "main" db>>
                <<Extract the arguments that describe the join>>
                <<Execute the query and return its results>>
              })

    ## setMethod("select", signature = c("dbframe", "list"), 
    ##           function(x, cols,...) {
    ##             <Handle lists of a single query element>>
    ##             <Manage arguments for compound queries>>
    ##             <Construct individual SQL select statements for compound queries>>
    ##             <Execute query and return data>>
    ##           })
      
      
    <<Define additional useful functions>>

Unfortunately, this method doesn't yet allow for joins or compound queries involving data.frames. Maybe a better approach would be to always let the “main” database be a temporary one on disk.

Assembling the sql select statement

To write the "select" part, we add the group.by variables and the cols variables together (and store them in cols).

<<Format the "select" part of the statement>>=
    <<Add new group.by variables in front of cols variables>>
    labels <- names(cols)
    labels[nchar(labels) > 0] <- paste("AS", labels[nchar(labels) > 0])
    cols <- paste(cols, labels, collapse = ", ")

To save typing, I assume that we want to retrieve the grouping variables and so we don't have to specify them explicitly in the cols vector. I think it makes sense to have the grouping variables on the left side of the results set instead of the right side. The next code chunk does both of those.

<<Add new group.by variables in front of cols variables>>=
    cols <-
      if (is.null(cols)) {
        group.by
      } else if (is.null(group.by)) {
        cols
      } else {
        if (is.null(names(cols)))
          names(cols) <- rep("", length(cols))
        if (is.null(names(group.by))) 
          names(group.by) <- rep("", length(group.by))
        c(group.by[!(names(group.by) %in% names(cols))
                    |  nchar(names(group.by)) == 0], cols)
      }

Managing the other arguments is easy. If they're NULL we replace the variable with an empty string; if they're not, we add the appropriate label and replace the variable with a character object that contains a clause for the sql statement.

<<Format the "group by" part of the statement>>=
    group.by <- 
      if (is.null(group.by)) {
        "" 
      } else {
        paste("group by", paste(group.by, collapse = ", "))
      }

<<Format the "order by" part of the statement>>=
    order.by <- 
      if (is.null(order.by)) {
        ""
      } else {
        paste("order by", paste(order.by, collapse = ", "))
      }

<<Format the "having" part of the statement>>=
    having <-
      if (is.null(having)) {
        ""
      } else {
        paste("having", having)
      }

<<Format the "where" part of the statement>>=
    where <-
      if (is.null(where)) {
        "" 
      } else {
        paste("where", where)
      }

<<Format the "limit" part of the statement>>=
    limit <- 
      if (is.null(limit)) {
        "" 
      } else {
        paste("limit", limit)
      }

Details of argument handling for simple queries

The individual methods just call generate.select.sql and execute the select statement. If cols is "missing" it returns results for cols equal to "*" (i.e. all of the columns of the table).

1 2	<<Execute select for c("ANY", "missing")>>= select(x, "*", as.data.frame,...)

<<Execute select for c("dbframe", "character")>>=
    if (!is.linked(x)) {
      warning("Table does not exist in the data base")
      return(list())
    }
    arguments <- list(table = tablename(x), cols = cols,...)
    sql.statement <- do.call("generate.select.sql", arguments)
    if (as.data.frame) {
      dbc <- dbConnect(x)
      d <- do.call("dbGetQuery", c(conn = dbc, statement = sql.statement,
                                   arguments))
      dbDisconnect(dbc)
    } else {
      if (is.null(arguments$readonly)) {
        readonly <- readonly(x)
      } else {
        readonly <- arguments$readonly
        arguments$readonly <- NULL
      }
      d <- do.call("new", c(Class = "dbframe", table = sql.statement,
                   readonly = readonly, dbConnect.arguments = arguments))
    }
    return(d)

<<Handle arguments and set up the local environment>>=
    if (!as.data.frame)
      warning("'as.data.frame' ignored when selecte is called on a data.frame.")
    tablename <- "dataframe"
    require(RSQLite)
    require(RSQLite.extfuns)

1
2
3

<<Export the new data frame to a temporary SQLite database>>=
    dbc <- dbConnect("SQLite", dbname = ":memory:")
    dbWriteTable(dbc, tablename, x, row.names = FALSE)

<<Query the new database and close database connection>>=
    sql.statement <- generate.select.sql(tablename, cols,...)
    queryresults <- dbGetQuery(dbc, sql.statement)
    dbDisconnect(dbc)

Details of argument handling for joins

So far, I'm only supporting joins for dbframes that are linked to sqlite data bases and for data.frames.

<<Detect inappropriate uses of the "list" method>>=
    if (length(x) == 1) return(select(x[[1]], cols,...))
    if (is.null(names(x))) names(x) <- LETTERS[seq_along(x)]
    tableclasses <- sapply(x, class)
    if (!all(tableclasses %in% c("dbframe_sqlite", "data.frame")))
      stop("Some of your dbframes aren't supported yet")
    if (any(tableclasses == "data.frame")) {
      require(RSQLite)
      require(RSQLite.extfuns)
    }

One nice feature of this function is that it handles all of the “attach” commands that are necessary to merge tables that exist in different databases. If all of the dbframes link to the same database, then that one will obviously be the main database; otherwise we connect to a temporary sqlite database and attach everything there.

<<Define and attach to "main" db>>=
    dbnames <- tablenames <- rep(NA, length(x))
    for (s in seq_along(x)) {
      if (tableclasses[s] == "dbframe_sqlite") {
        dbnames[s] <- dbname(x[[s]])
        tablenames[s] <- tablename(x[[s]])
      } else {
        dbnames[s] <- "temp"
        tablenames[s] <- names(x)[[s]]
      }
    }
    not.data.frames <- which(tableclasses != "data.frame")
    dbalias <- dbnames
    if (isTRUE(sum(!duplicated(dbnames[not.data.frames])) == 1)) {
      maindbc <- dbConnect(x[[not.data.frames[1]]])
      dbnames[not.data.frames] <- "main"
      dbalias[not.data.frames] <- "main"
    } else {
      maindbc <- dbConnect("SQLite", dbname = ":memory:")
      <<Attach sqlite_dbframes to the main db>> 
    }
    <<Write dataframes to the main db>>

Any data frames are just going to be written to the temporary database.

<<Write dataframes to the main db>>=
    sapply(which(tableclasses == "data.frame"), function(s) 
           dbWriteTable(maindbc, paste("temp", tablenames[s], sep = "."),
                                                      x[[s]], row.names = FALSE))

Tables that already exist in other databases are attached to the temporary database.

<<Attach sqlite_dbframes to the main db>>=
    dbcount <- 0
    unique.databases <- unique(dbnames[!(dbnames %in% c("temp", "main"))])
    for (db in unique.databases) {
      dbcount <- dbcount + 1
      currentalias <- sprintf("ALIAS%d", dbcount)
      dbalias[dbalias == db] <- currentalias
      r <- dbSendQuery(maindbc, sprintf("attach database '%s' as %s", db, currentalias))
      dbClearResult(r)
    }

The columns are already specified for the query; the only thing to do is assemble the sql code for the "table" part. If the join type is not specified, the default is to do an inner join; there is no default for “on” or “using”, so one (and only one) of those arguments must be specified.

<<Extract the arguments that describe the join>>=
    arguments <- list(...)
    join  <- extract.element("join", "inner", length(x) - 1, arguments)
    on    <- extract.element("on", NA, length(x) - 1, arguments)
    using <- extract.element("using", NA, length(x) - 1, arguments)
    if (any(is.na(on) & is.na(using)))
      stop("'on' and 'using' can't both be specified for the same join.")
    arguments$join  <- NULL
    arguments$on    <- NULL
    arguments$using <- NULL
    arguments$cols  <- cols
    arguments$table <- paste(collapse = " ", c(
      sprintf("%s.%s %s", dbalias[1], tablenames[1], names(x)[1]),
      sprintf("%s join %s.%s %s %s", join, dbalias[-1], tablenames[-1], 
                                                                   names(x)[-1],
              ifelse(is.na(on), ifelse(is.na(using), "", 
                          sprintf("using(%s)", using)), sprintf("on %s", on)))))

Actually executing the query is the same as for the other methods. We're not going to worry about detaching the tables explicitly, since we know that they were only attached if we're using a temporary data base.

<<Execute the query and return its results>>=
    results <- dbGetQuery(maindbc, do.call(generate.select.sql, arguments))
    dbDisconnect(maindbc)
    return(results)

Miscellaneous function

<<Define additional useful functions>>=
    extract.element <- function(name, default, length.required, argument.list) {
      v <- if (name %in% names(argument.list)) argument.list[[name]] else default
      if (is.na(length.required) | length(v) == length.required) return(v)
      else if (length(v) == 1) return(rep(v, length.required))
      else stop("Incorrect length of argument")
    }

I just have some basic sanity-check type unit tests; i.e. do the functions run at a minimal level.

<<test-select.R>>=
    library(testthat)
    data(chickwts)
    chickwts$feed <- as.character(chickwts$feed)
    test_that("insert and select work", {
      <<Individual tests that insert and select work>>})
    test_that("column renaming scheme works", {
      <<Individual tests that column renaming works>>})
    test_that("joins work", {
      <<Individual tests that joins work>>})

First we'll check that the methods defined on data frames work, then that they work for dbframes.

<<Individual tests that insert and select work>>=
    expect_that(chickwts, is_equivalent_to(select(chickwts)))
    <<Create temporary test database and dbframe>>
    insert(testdbframe) <- chickwts
    expect_that(chickwts, is_equivalent_to(select(testdbframe)))
    <<Remove temporary test database>>

<<Individual tests that column renaming works>>=
    expect_that(
      c("feed", "AverageWeight"), 
      is_identical_to(names(select(chickwts, 
           c(AverageWeight = "avg(weight)"), group.by = "feed"))))
    <<Create temporary test database and dbframe>>
    insert(testdbframe) <- chickwts
    expect_that(
      c("feed", "AverageWeight"), 
      is_identical_to(names(select(testdbframe, 
           c(AverageWeight = "avg(weight)"), group.by = "feed"))))
    <<Remove temporary test database>>

<<Individual tests that joins work>>=
    <<Create temporary test database and dbframe>>
    expect_that(select(list(A = chickwts,B =  chickwts), 
                    c("feed", weightA = "A.weight", weightB = "B.weight"), 
                    using = "feed", order.by = c("feed", "weightA", "weightB")),
                equals({
                  d <- merge(chickwts, chickwts, by = "feed", 
                                                         suffixes = c("A", "B"))
                  d$feed <- as.character(d$feed)
                  d[do.call(order, d),]
                }, check.attributes = FALSE))

    avgwts <- dbframe("select2", dbname = testdbfile, clear = TRUE,
                       data = select(chickwts, c(averageweight = "avg(weight)"),
                                                group.by = c(thefeed = "feed")))
    expect_that(select(list(a = chickwts, b = avgwts), 
                       c("feed", "weight", "averageweight"),
                       on = ("feed = thefeed"), order.by = "feed, weight"),
                equals({
                  d <- merge(chickwts, select(avgwts), by.x = "feed", 
                                                               by.y = "thefeed")
                  d$feed <- as.character(d$feed)
                  d[do.call(order, d),]
                }, check.attributes = FALSE))
    <<Remove temporary test database>>

<<Create temporary test database and dbframe>>=
    testdbfile <- tempfile(fileext = ".db")    
    testdbframe <- dbframe("select1", testdbfile)
    clear(testdbframe)

1 2	<<Remove temporary test database>>= unlink(testdbfile)

sql as understood by sqlite. http://www.sqlite.org/lang_select.html

filename <- tempfile(fileext = ".db")
data(chickwts)
chicksdb <- dbframe("select1", dbname = filename, 
                    clear = TRUE, data = chickwts)
select(chicksdb, where = "weight > 200", order.by = "weight")
select(chicksdb, c(averageweight = "avg(weight)"), group.by = "feed")
select(chicksdb, c(averageweight = "avg(weight)"), group.by = "feed",
       having = "averageweight > 250")

## and an example of querying the data frame directly
select(chickwts, c(averageweight = "avg(weight)"), 
       group.by = c(thefeed = "feed"))
avgwts <- dbframe("select2", dbname = filename, clear = TRUE,
                  data = select(chickwts, c(averageweight = "avg(weight)"), 
                                group.by = c(thefeed = "feed")))
## an example of a join
select(list(a = chicksdb, b = avgwts), c("feed", "weight", "averageweight"),
       on = ("feed = thefeed"), order.by = "feed, weight")