Nothing
#' sqlscore: Utilities to score GLMs and related models in SQL.
#'
#' The sqlscore package provides utilities for generating sql queries
#' (particularly CREATE TABLE statements) from R model objects. The most important
#' use case is generating SQL to score a GLM or related model represented as an R
#' object, in which case the package handles parsing formula operators and
#' including the model's response function. The models scored need not be generalized
#' linear models, strictly speaking, but their prediction steps must consist of applying
#' a response function to a linear predictor. The package handles escaping and dealing
#' with formula operators, and provides a way to use a custom response function if desired.
#'
#' @section Function overview:
#' The SQL-generating functions create_statement and select_statement do what their
#' names suggest and generate CREATE TABLE and SELECT statements for model scoring.
#' Helper functions include linpred(), which generates an R call object representing
#' the linear predictor, and score_expression, an S3 generic that handles wrapping
#' the linear predictor in the response function.
#'
#' @section Supported models:
#' Specific packages and models that are known to work include: glm and lm from
#' package:stats, cv.glmnet from package:glmnet, glmboost from package:mboost,
#' and bayesglm from package:arm.
#'
#' Default S3 methods are for objects structured like those of class "glm", so
#' models not listed here may work if they resemble those objects, but are not
#' guaranteed to.
#'
#' @docType package
#' @name sqlscore
NULL
# Fully qualify and escape a table name given in parts.
#
# Given the three canonical parts of a database table name (catalog, schema
# and table), escape all three and join them together to produce the fully
# qualified table name. Only the table name is required.
#
# An open database connection can be passed as the `con` argument, or the
# `dbplyr::simulate_*` functions can be used in applications which don't have
# a DB connection when they need to generate SQL.
#'
# @param table The unqualified table name.
# @param catalog The catalog name.
# @param schema The schema name.
# @param con A DBI connection to control the details of SQL generation;
# defaults to dbplyr::simulate_dbi() for the best guess at portable SQL.
#
# @return A dbplyr SQL object representing the fully qualified and escaped
# table name.
fqtn <-
function(table, catalog=NULL, schema=NULL, con=dbplyr::simulate_dbi())
{
if(is.null(table) || table == "")
stop("Bad table name")
if(!is.null(schema) && schema == "")
stop("Bad schema name")
if(!is.null(catalog) && catalog == "")
stop("Bad catalog name")
if(!is.null(catalog) && is.null(schema))
stop("Cannot give a catalog name without a schema")
#This kind of list manipulation in R is inefficient,
#but for three short character vectors, it doesn't matter.
dp <- list()
if(!is.null(catalog))
{
dp[[length(dp) + 1]] <- c(sql_escape_ident(catalog))
dp[[length(dp) + 1]] <- "."
}
if(!is.null(schema))
{
dp[[length(dp) + 1]] <- c(sql_escape_ident(schema))
dp[[length(dp) + 1]] <- "."
}
dp[[length(dp) + 1]] <- c(sql_escape_ident(table))
dp$con <- con
do.call(dbplyr::build_sql, dp)
}
#' Generate a SELECT statement from a model
#'
#' Generate a SELECT statement to score the passed model on a preexisting
#' database table. The statement will generate predictions entirely in the
#' database, with no need to fetch data into R. Models need not be GLMs, but
#' their prediction steps must consist of applying a response function to
#' a linear predictor.
#'
#' An open database connection can be passed as the `con` argument, or the
#' `dbplyr::simulate_*` functions can be used in applications which don't have
#' a DB connection when they need to generate SQL.
#'
#' @section Supported packages:
#' Specific packages and models that are known to work include: glm and lm from
#' package:stats, cv.glmnet from package:glmnet, glmboost from package:mboost,
#' and bayesglm from package:arm.
#'
#' Default S3 methods are for objects structured like those of class "glm", so
#' models not listed here may work if they resemble those objects, but are not
#' guaranteed to.
#'
#' @section Warning:
#' Note that if the model object transformed its training data before fitting (e.g.,
#' centering and scaling predictors), the generated SQL statement will not include
#' those transformations. A future release may include that functionality, but
#' centering and scaling in particular are difficult to do efficiently and portably
#' in SQL.
#'
#' @param mod A supported model object.
#' @param src_table The unqualified DB name of the source table.
#' @param src_schema The DB schema of the source table.
#' @param src_catalog The DB catalog of the source table.
#' @param pk A vector of primary key column names.
#' @param response The name of a custom response function to apply to the linear predictor.
#' @param con A DBI connection to control the details of SQL generation;
#' defaults to dbplyr::simulate_dbi() for the best guess at portable SQL.
#'
#' @return A dbplyr SQL object representing the SELECT statement.
#'
#' @examples
#' # Basic select statements
#' mod <- glm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width + Species,
#' data=datasets::iris)
#' select_statement(mod, src_table="tbl_name")
#' select_statement(mod, src_table="tbl_name", src_schema="schema_name",
#' src_catalog="catalog_name")
#' select_statement(mod, src_table="tbl_name", src_schema="schema_name",
#' src_catalog="catalog_name", pk=c("lab", "specimen_id"))
#'
#' #With a custom response function
#' select_statement(mod, src_table="tbl_name", src_schema="schema_name",
#' response="probit")
#'
#' # With a model-derived non-identity response function
#' mod <- glm(Sepal.Length > 5.0 ~ Sepal.Width + Petal.Length + Petal.Width + Species,
#' data=datasets::iris, family=binomial("logit"))
#' select_statement(mod, src_table="tbl_name")
#'
#' #With formula operators
#' x <- matrix(rnorm(100*20),100,20)
#' colnames(x) <- sapply(1:20, function(x) paste0("X", as.character(x)))
#' x <- as.data.frame(x)
#' mod <- glm(X2 ~ X3 + X5 + X15*X8, data=x)
#' select_statement(mod, src_table="tbl_name")
#' select_statement(mod, src_table="tbl_name", response="cauchit")
#'
#' @export select_statement
select_statement <-
function(mod, src_table, src_schema=NULL, src_catalog=NULL, pk=c("id"),
response=NULL, con=dbplyr::simulate_dbi())
{
#Fully qualify and escape the src table
src <- fqtn(src_table, src_catalog, src_schema, con=con)
#Put the statement together
parts <- list()
parts[[length(parts) + 1]] <- "SELECT "
for(i in seq_along(pk))
{
parts[[length(parts) + 1]] <- dbplyr::ident(pk[i])
parts[[length(parts) + 1]] <- ", "
}
se <- list(score_expression(mod, response=response), con=con)
parts[[length(parts) + 1]] <- do.call(dbplyr::translate_sql, se)
parts[[length(parts) + 1]] <- " FROM "
parts[[length(parts) + 1]] <- src
#We're leaving off the terminating semicolon to let people more easily
#tack on concluding incantations for the select (string munging is great)
parts$con <- con
do.call(dbplyr::build_sql, parts)
}
#' Generate a CREATE TABLE statement from a model
#'
#' Generate a CREATE TABLE statement to score the passed model on a preexisting
#' database table. The statement will generate predictions entirely in the
#' database, with no need to fetch data into R. Models need not be GLMs, but
#' their prediction steps must consist of applying a response function to
#' a linear predictor.
#'
#' An open database connection can be passed as the `con` argument, or the
#' `dbplyr::simulate_*` functions can be used in applications which don't have
#' a DB connection when they need to generate SQL.
#'
#' @section Supported packages:
#' Specific packages and models that are known to work include: glm and lm from
#' package:stats, cv.glmnet from package:glmnet, glmboost from package:mboost,
#' and bayesglm from package:arm.
#'
#' Default S3 methods are for objects structured like those of class "glm", so
#' models not listed here may work if they resemble those objects, but are not
#' guaranteed to.
#'
#' @section Warning:
#' Note that if the model object transformed its training data before fitting (e.g.,
#' centering and scaling predictors), the generated SQL statement will not include
#' those transformations. A future release may include that functionality, but
#' centering and scaling in particular are difficult to do efficiently and portably
#' in SQL.
#'
#' @param mod A supported model object.
#' @param src_table The unqualified DB name of the source table.
#' @param src_schema The DB schema of the source table.
#' @param src_catalog The DB catalog of the source table.
#' @param dest_table The unqualified DB name of the destination table.
#' @param dest_schema The DB schema of the destination table.
#' @param dest_catalog The DB catalog of the destination table.
#' @param drop Whether to generate a DROP TABLE IF EXISTS before the CREATE TABLE.
#' @param temporary Whether the destination table should be a temporary table.
#' @param pk A vector of primary key column names.
#' @param response The name of a custom response function to apply to the linear predictor.
#' @param con A DBI connection to control the details of SQL generation;
#' defaults to dbplyr::simulate_dbi() for the best guess at portable SQL.
#'
#' @return A dbplyr SQL object representing the SELECT statement.
#'
#' @examples
#' # Basic create statements
#' mod <- glm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width + Species,
#' data=datasets::iris)
#' create_statement(mod, src_table="tbl_name", dest_table="target_tbl")
#' create_statement(mod, src_table="tbl_name", src_schema="schema_name",
#' src_catalog="catalog_name", dest_table="target_tbl")
#' create_statement(mod, src_table="tbl_name", src_schema="schema_name",
#' src_catalog="catalog_name", dest_table="target_tbl",
#' dest_schema="target_schema", dest_catalog="target_catalog",
#' pk=c("lab", "specimen_id"))
#'
#' #With a custom response function
#' create_statement(mod, src_table="tbl_name", src_schema="schema_name",
#' dest_table="target_tbl", response="probit")
#'
#' # With a model-derived non-identity response function
#' mod <- glm(Sepal.Length > 5.0 ~ Sepal.Width + Petal.Length + Petal.Width + Species,
#' data=datasets::iris, family=binomial("logit"))
#' create_statement(mod, src_table="tbl_name", dest_table="target_tbl")
#'
#' #With formula operators
#' x <- matrix(rnorm(100*20),100,20)
#' colnames(x) <- sapply(1:20, function(x) paste0("X", as.character(x)))
#' x <- as.data.frame(x)
#' mod <- glm(X2 ~ X3 + X5 + X15*X8, data=x)
#' create_statement(mod, src_table="tbl_name", dest_table="target_tbl")
#' create_statement(mod, src_table="tbl_name", dest_table="target_tbl",
#' response="cauchit")
#'
#' @export create_statement
create_statement <-
function(mod, dest_table, src_table,
dest_schema=NULL, dest_catalog=NULL, src_schema=NULL,
src_catalog=NULL, drop=FALSE, temporary=FALSE,
pk=c("id"), response=NULL, con=dbplyr::simulate_dbi())
{
# Ideally, we'd use some kind of object-relational mapper to build
# this statement rather than just munging text, but the ones available
# for R are underdeveloped. dbplyr comes closest but can't quite do this -
# in particular, it doesn't support CREATE TABLE AS SELECT
#Fully qualify and escape the dest table
dest <- fqtn(dest_table, dest_catalog, dest_schema, con=con)
#Put the statement together
parts <- list()
if(!is.null(drop) && drop)
parts <- c(parts, "DROP TABLE IF EXISTS ", dest, ";\n")
parts <- c(parts, "CREATE ",
ifelse(temporary, "TEMPORARY ", ""),
"TABLE ", dest, " AS ")
ss <- select_statement(mod, src_table=src_table, src_schema=src_schema,
src_catalog=src_catalog, pk=pk, response=response, con=con)
parts <- c(parts, ss)
#We're leaving off the terminating semicolon to let people more easily
#tack on concluding incantations for the select (string munging is great)
parts$con <- con
do.call(dbplyr::build_sql, parts)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.