#' SQL-backed DataFrame
#'
#' Create a SQL-backed \linkS4class{DataFrame}, where the data are
#' kept on disk until requested. Direct extension classes are
#' \code{SQLiteDataFrame} and \code{DuckDBDataFrame}.
#'
#' @param path String containing a path to a SQL file.
#' @param dbtype String containing the SQL database type (case
#' insensitive). Supported types are "SQLite" and "DuckDB".
#' @param table String containing the name of SQL table.
#' @param columns Character vector containing the names of columns in
#' a SQL table. If \code{NULL}, this is determined from
#' \code{path}.
#' @param nrows Integer scalar specifying the number of rows in a SQL
#' table. If \code{NULL}, this is determined from \code{path}.
#' @return A SQLDataFrame where each column is a
#' \linkS4class{SQLColumnVector}.
#'
#' @details The SQLDataFrame is essentially just a
#' \linkS4class{DataFrame} of \linkS4class{SQLColumnVector}
#' objects. It is primarily useful for indicating that the
#' in-memory representation is consistent with the underlying
#' SQL file (e.g., no delayed filter/mutate operations have
#' been applied, no data has been added from other files). Thus,
#' users can specialize code paths for a SQLDataFrame to
#' operate directly on the underlying SQL table.
#'
#' In that vein, operations on a SQLDataFrame may return another
#' SQLDataFrame if the operation does not introduce inconsistencies
#' with the file-backed data. For example, slicing or combining by
#' column will return a SQLDataFrame as the contents of the retained
#' columns are unchanged. In other cases, the SQLDataFrame will
#' collapse to a regular \linkS4class{DFrame} of
#' \linkS4class{SQLColumnVector} objects before applying the
#' operation; these are still file-backed but lack the guarantee of
#' file consistency.
#'
#' @author Qian Liu
#' @examples
#'
#' ## Mocking up a file:
#'
#' ### SQLite
#' tf <- tempfile()
#' on.exit(unlink(tf))
#' con <- DBI::dbConnect(RSQLite::SQLite(), tf)
#' DBI::dbWriteTable(con, "mtcars", mtcars)
#' DBI::dbDisconnect(con)
#'
#' ### DuckDB
#' tf1 <- tempfile()
#' on.exit(unlist(tf1))
#' con <- DBI::dbConnect(duckdb::duckdb(), tf1)
#' DBI::dbWriteTable(con, "mtcars", mtcars)
#' DBI::dbDisconnect(con)
#'
#' ## Creating a SQLite-backed data frame:
#'
#' df <- SQLDataFrame(tf, dbtype = "SQLite", table = "mtcars")
#' df1 <- SQLiteDataFrame(tf, "mtcars")
#' identical(df, df1)
#'
#' ## DuckDB-backed data frame:
#' df2 <- SQLDataFrame(tf1, dbtype = "duckdb", table = "mtcars")
#' df3 <- DuckDBDataFrame(tf1, "mtcars")
#' identical(df2, df3)
#' ## Extraction yields a SQLiteColumnVector:
#' df$carb
#'
#' ## Some operations preserve the SQLDataFrame:
#' df[,1:5]
#' combined <- cbind(df, df)
#' class(combined)
#'
#' ## ... but most operations collapse to a regular DFrame:
#' df[1:5,]
#' combined2 <- cbind(df, some_new_name=df[,1])
#' class(combined2)
#'
#' df1 <- df
#' rownames(df1) <- paste0("row", seq_len(nrow(df1)))
#' class(df1)
#'
#' df2 <- df
#' colnames(df2) <- letters[1:ncol(df2)]
#' class(df2)
#'
#' df3 <- df
#' df3$carb <- mtcars$carb
#' class(df3)
#'
#' ## Utility functions
#' path(df)
#' dbtype(df)
#' sqltable(df)
#' dim(df)
#' names(df)
#'
#' as.data.frame(df)
#'
#' @rdname SQLDataFrame
#' @aliases
#' SQLDataFrame-class
#'
#' nrow,SQLDataFrame-method
#' ncol,SQLDataFrame-method
#' length,SQLDataFrame-method
#' path,SQLDataFrame-method
#'
#' rownames,SQLDataFrame-method
#' names,SQLDataFrame-method
#' rownames<-,SQLDataFrame-method
#' names<-,SQLDataFrame-method
#'
#' extractROWS,SQLDataFrame,ANY-method
#' extractCOLS,SQLDataFrame-method
#' [[,SQLDataFrame-method
#'
#' replaceROWS,SQLDataFrame-method
#' replaceCOLS,SQLDataFrame-method
#' normalizeSingleBracketReplacementValue,SQLDataFrame-method
#' [[<-,SQLDataFrame-method
#'
#' cbind,SQLDataFrame-method
#' cbind.SQLDataFrame
#'
#' as.data.frame,SQLDataFrame-method
#' coerce,SQLDataFrame,DFrame-method
#'
#' @export
SQLDataFrame <- function(path, dbtype=NULL, table=NULL, columns=NULL, nrows=NULL) {
if (is.null(dbtype))
stop("Please specify the SQL database type: sqlite, duckdb.")
dbtype <- switch(tolower(dbtype),
sqlite = "SQLite",
duckdb = "DuckDB")
con <- acquireConn(path, dbtype)
if (is.null(table)) {
tbls <- DBI::dbListTables(con)
stop("Please specify a table name. \n Available tables are: ",
paste(tbls, collapse=", "))
}
if (is.null(columns) || is.null(nrows)) {
if (is.null(columns)) {
columns <- DBI::dbListFields(con, table)
}
if (is.null(nrows)) {
nrows <- as.integer(DBI::dbGetQuery(con, paste0("SELECT COUNT(", columns[1], ") FROM ", table))[,1])
}
}
new("SQLDataFrame", path=path, dbtype = dbtype, table = table, columns=columns, nrows=nrows)
}
#' @export
setClass("SQLDataFrame", contains="DataFrame",
slots=c(path="character", dbtype="character", table="character", columns="character", nrows="integer"))
#' @export
setMethod("nrow", "SQLDataFrame", function(x) x@nrows)
#' @export
setMethod("length", "SQLDataFrame", function(x) length(x@columns))
#' @export
setMethod("path", "SQLDataFrame", function(object) object@path)
#' @export
setMethod("sqltable", "SQLDataFrame", function(x) x@table)
#' @export
setMethod("dbtype", "SQLDataFrame", function(x) x@dbtype)
#' @export
setMethod("rownames", "SQLDataFrame", function(x) NULL)
#' @export
setMethod("names", "SQLDataFrame", function(x) x@columns)
#' @export
setReplaceMethod("rownames", "SQLDataFrame", function(x, value) {
if (!is.null(value)) {
x <- .collapse_to_df(x)
rownames(x) <- value
}
x
})
#' @export
setReplaceMethod("names", "SQLDataFrame", function(x, value) {
if (!identical(value, names(x))) {
x <- .collapse_to_df(x)
names(x) <- value
}
x
})
#' @export
#' @importFrom S4Vectors extractROWS
setMethod("extractROWS", "SQLDataFrame", function(x, i) {
if (!missing(i)) {
collapsed <- .collapse_to_df(x)
extractROWS(collapsed, i)
} else {
x
}
})
#' @export
#' @importFrom stats setNames
#' @importFrom S4Vectors extractCOLS normalizeSingleBracketSubscript
setMethod("extractCOLS", "SQLDataFrame", function(x, i) {
if (!missing(i)) {
xstub <- setNames(seq_along(x), names(x))
i <- normalizeSingleBracketSubscript(i, xstub)
x@columns <- names(x)[i]
x@elementMetadata <- extractROWS(x@elementMetadata, i)
}
x
})
#' @export
#' @importFrom S4Vectors normalizeDoubleBracketSubscript
setMethod("[[", "SQLDataFrame", function(x, i, j, ...) {
if (!missing(j)) {
stop("list-style indexing of a SQLDataFrame with non-missing 'j' is not supported")
}
if (missing(i) || length(i) != 1L) {
stop("expected a length-1 'i' for list-style indexing of a SQLDataFrame")
}
i <- normalizeDoubleBracketSubscript(i, x)
SQLColumnVector(path(x), dbtype(x), sqltable(x), column=names(x)[i])
})
#' @export
#' @importFrom S4Vectors replaceROWS
setMethod("replaceROWS", "SQLDataFrame", function(x, i, value) {
x <- .collapse_to_df(x)
replaceROWS(x, i, value)
})
#' @export
#' @importFrom S4Vectors normalizeSingleBracketReplacementValue
setMethod("normalizeSingleBracketReplacementValue", "SQLDataFrame", function(value, x) {
if (is(value, "SQLColumnVector")) {
return(new("SQLDataFrame", path=path(value@seed),
dbtype = dbtype(value@seed),
table = sqltable(value@seed),
columns=value@seed@column,
nrows=length(value)))
}
callNextMethod()
})
#' @export
#' @importFrom stats setNames
#' @importFrom S4Vectors replaceCOLS normalizeSingleBracketSubscript
setMethod("replaceCOLS", "SQLDataFrame", function(x, i, value) {
xstub <- setNames(seq_along(x), names(x))
i2 <- normalizeSingleBracketSubscript(i, xstub, allow.NAs=TRUE)
if (length(i2) == 1L && !is.na(i2)) {
if (is(value, "SQLDataFrame")) {
if (path(x) == path(value) &&
dbtype(x) == dbtype(value) &&
sqltable(x) == sqltable(value) &&
identical(names(x)[i2], names(value))) {
return(x)
}
}
}
# In theory, it is tempting to return a SQLDataFrame; the problem is
# that assignment will change the mapping of column names to their
# contents, so it is no longer a pure representation of a SQLDataFrame.
x <- .collapse_to_df(x)
replaceCOLS(x, i, value)
})
#' @export
#' @importFrom S4Vectors normalizeDoubleBracketSubscript
setMethod("[[<-", "SQLDataFrame", function(x, i, j, ..., value) {
i2 <- normalizeDoubleBracketSubscript(i, x, allow.nomatch=TRUE)
if (length(i2) == 1L && !is.na(i2)) {
if (is(value, "SQLColumnVector")) {
if (path(x) == path(value@seed) &&
dbtype(x) == dbtype(value@seed) &&
sqltable(x) == sqltable(value@seed) &&
names(x)[i2] == value@seed@column) {
return(x)
}
}
}
x <- .collapse_to_df(x)
x[[i]] <- value
x
})
#' @export
#' @importFrom S4Vectors mcols make_zero_col_DFrame combineRows
cbind.SQLDataFrame <- function(..., deparse.level=1) {
preserved <- TRUE
all_columns <- character(0)
objects <- list(...)
xpath <- NULL
xtable <- NULL
xdbtype <- NULL
for (i in seq_along(objects)) {
obj <- objects[[i]]
if (is(obj, "SQLDataFrame")) {
if (is.null(xpath)) {
xpath <- path(obj)
xtable <- sqltable(obj)
xdbtype <- dbtype(obj)
} else if (path(obj) != xpath ||
dbtype(obj) != xdbtype ||
sqltable(obj) != xtable) {
preserved <- FALSE
break
}
all_columns <- c(all_columns, names(obj))
} else if (is(obj, "SQLColumnVector")) {
if (is.null(xpath)) {
xpath <- path(obj@seed)
xtable <- sqltable(obj@seed)
xdbtype <- dbtype(obj@seed)
} else if (path(obj@seed) != xpath ||
dbtype(obj@seed) != xdbtype ||
sqltable(obj@seed) != xtable ||
!identical(names(objects)[i], obj@seed@column)) {
preserved <- FALSE
break
}
all_columns <- c(all_columns, obj@seed@column)
} else {
preserved <- FALSE
break
}
}
if (!preserved) {
for (i in seq_along(objects)) {
obj <- objects[[i]]
if (is(obj, "SQLDataFrame")) {
objects[[i]] <- .collapse_to_df(obj)
}
}
do.call(cbind, objects)
} else {
all_mcols <- list()
has_mcols <- FALSE
all_metadata <- list()
for (i in seq_along(objects)) {
obj <- objects[[i]]
mc <- NULL
md <- list()
if (is(obj, "DataFrame")) {
mc <- mcols(obj, use.names=FALSE)
md <- metadata(obj)
if (is.null(mc)) {
mc <- make_zero_col_DFrame(length(obj))
} else {
has_mcols <- TRUE
}
} else {
mc <- make_zero_col_DFrame(1)
}
all_mcols[[i]] <- mc
all_metadata[[i]] <- md
}
if (has_mcols) {
all_mcols <- do.call(combineRows, all_mcols)
} else {
all_mcols <- NULL
}
new("SQLDataFrame",
path=xpath,
dbtype=xdbtype,
table=xtable,
columns=all_columns,
nrows=NROW(objects[[1]]),
elementMetadata=all_mcols,
metadata=do.call(c, all_metadata)
)
}
}
#' @export
#' @importFrom S4Vectors bindCOLS
setMethod("cbind", "SQLDataFrame", cbind.SQLDataFrame)
#' @importFrom S4Vectors make_zero_col_DFrame mcols mcols<- metadata metadata<-
.collapse_to_df <- function(x) {
df <- make_zero_col_DFrame(x@nrows)
for (i in seq_along(names(x))) {
df[[as.character(i)]] <- SQLColumnVector(path(x), dbtype(x), sqltable(x), column=names(x)[i])
}
colnames(df) <- names(x)
mcols(df) <- mcols(x, use.names=FALSE)
metadata(df) <- metadata(x)
df
}
#' @export
setMethod("as.data.frame", "SQLDataFrame", function(x, row.names = NULL, optional = FALSE, ...) {
con <- acquireConn(path(x), dbtype(x))
ucol <- unique(x@columns)
tab <- DBI::dbGetQuery(con, paste0("SELECT ", paste(ucol, collapse=","), " FROM ", sqltable(x)))
output <- as.data.frame(tab, row.names=row.names, optional=optional, ...)
output <- output[,match(x@columns,colnames(output)),drop=FALSE]
output
})
#' @export
setAs("SQLDataFrame", "DFrame", function(from) .collapse_to_df(from))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.