# Load the package found at the project root (usethis::proj_get()) through
# pkgload, with `helpers` and `export_all` switched on.
withr::with_dir(
    new = usethis::proj_get(),
    code = pkgload::load_all(helpers = TRUE, export_all = TRUE)
)

# global options ----------------------------------------------------------
options(tidyverse.quiet = TRUE)

# bookdown ----------------------------------------------------------------
options(tinytex.verbose = TRUE)
# Hook stored under `bookdown.post.latex`: it receives the LaTeX lines and
# replaces every line containing the 248-grey `shadecolor` definition with the
# 230-grey definition; all other lines are returned untouched.
options(bookdown.post.latex = function(lines) {
    is_shade_line <- grepl(
        "\\definecolor{shadecolor}{RGB}{248,248,248}",
        lines,
        fixed = TRUE
    )
    lines[is_shade_line] <- "\\definecolor{shadecolor}{RGB}{230,230,230}"
    lines
})

# knitr -------------------------------------------------------------------
# Chunk defaults for the whole book. Each option is listed exactly once:
# `out.width` is "70%" (a second, conflicting `out.width` entry would only
# obscure which value is in effect).
knitr::opts_chunk$set(
    echo = FALSE,
    results = "markup",
    message = FALSE,
    warning = FALSE,
    cache = TRUE,
    comment = "#>",
    fig.retina = 0.8, # figures are either vectors or 300 dpi diagrams
    dpi = 300,
    out.width = "70%",
    fig.align = "center",
    fig.width = 6,
    fig.asp = 0.618,  # 1 / phi
    fig.show = "hold",
    eval.after = "fig.cap" # so captions can use link to demos
)

# index.R metadata --------------------------------------------------------
# `index` holds accessor functions for the book metadata; most of them read
# from the DESCRIPTION file at the project root.
index <- new.env()
DESCRIPTION <- desc::description$new(usethis::proj_get())
index$title       <- function() DESCRIPTION$get_field("Title")
index$subtitle    <- function() DESCRIPTION$get_field("Subtitle")
index$description <- function() DESCRIPTION$get_field("Description")
# Author name = "given family" taken from the DESCRIPTION author entry
index$author      <- function() paste(unlist(DESCRIPTION$get_author())[c("given", "family")], collapse = " ")
index$date        <- base::Sys.Date
index$url         <- function() DESCRIPTION$get_urls()
index$cover_image <- function() NULL # "images/cover.png"
index$favicon     <- function() "favicon.ico"
index$github_repo <- function() "Kiwi-Random-House/R-Projects"

# Load the package found at the project root (usethis::proj_get()) through
# pkgload, with `helpers` and `export_all` switched on.
withr::with_dir(
    usethis::proj_get(),
    pkgload::load_all(helpers = TRUE, export_all = TRUE)
)

# global options ----------------------------------------------------------
options(tidyverse.quiet = TRUE)

# bookdown ----------------------------------------------------------------
options(tinytex.verbose = TRUE)
# Hook stored under `bookdown.post.latex`: replaces every LaTeX line containing
# the 248-grey `shadecolor` definition with the 230-grey definition.
options(bookdown.post.latex = function(lines) {
    rep_pos <- grep("\\definecolor{shadecolor}{RGB}{248,248,248}", lines, fixed = TRUE)
    lines[rep_pos] <- "\\definecolor{shadecolor}{RGB}{230,230,230}"
    lines
})

# knitr -------------------------------------------------------------------
# Chunk defaults for the whole book. Each option is listed exactly once:
# `out.width` is "70%" (a second, conflicting `out.width` entry would only
# obscure which value is in effect).
knitr::opts_chunk$set(
    echo = FALSE,
    results = "markup",
    message = FALSE,
    warning = FALSE,
    cache = TRUE,
    comment = "#>",
    fig.retina = 0.8, # figures are either vectors or 300 dpi diagrams
    dpi = 300,
    out.width = "70%",
    fig.align = "center",
    fig.width = 6,
    fig.asp = 0.618,  # 1 / phi
    fig.show = "hold",
    eval.after = "fig.cap" # so captions can use link to demos
)

# index.R metadata --------------------------------------------------------
# `index` holds accessor functions for the book metadata; most of them read
# from the DESCRIPTION file at the project root.
index <- new.env()
DESCRIPTION <- desc::description$new(usethis::proj_get())
index$title       <- function() DESCRIPTION$get_field("Title")
index$subtitle    <- function() DESCRIPTION$get_field("Subtitle")
index$description <- function() DESCRIPTION$get_field("Description")
# Author name = "given family" taken from the DESCRIPTION author entry
index$author      <- function() paste(unlist(DESCRIPTION$get_author())[c("given", "family")], collapse = " ")
index$date        <- base::Sys.Date
index$url         <- function() DESCRIPTION$get_urls()
index$cover_image <- function() NULL # "images/cover.png"
index$favicon     <- function() "favicon.ico"
index$github_repo <- function() "Kiwi-Random-House/R-Projects"

Preface {-}

What is the Motivation Behind this Book? {-}

Books about R tend to focus on data analytics, graphics, statistics, machine learning, programming skills, and data science (whatever that means). The code and examples in these books are divorced from a design of real-world software systems. Understandably, the examples in these books are oversimplified to keep the focal point on the technique.

Very few books talk about the holistic approach for undertaking endeavours in R. In particular, two books discuss standard ways to build and deploy applications in R:

  1. R packages [@Wickham2015], provides a framework for building R packages; and
  2. Engineering Production-Grade Shiny Apps [@Fay2020], provides a framework and a methodology for building robust Shiny applications.

There is, however, a topic that neither of these books covers - building analytic applications. This book aims to fill this gap.

Who Should Read this Book? {-}

Every data science project has at least a project lead and a data scientist. Sometimes they are the same person. In any case, the first task facing the project lead is creating a template repository [@Microsoft2017]. Most practitioners use a former project, which is itself a reincarnation of a former project, as the template repository. Regardless of what template repository is employed, its mechanics have to be communicated and taught to collaborators and your future self. Having a framework that is well documented and generalises well to a wide range of analytic applications means reducing everyone's cognitive effort and time spent teaching and learning different templates.

The core audience of this book is the data science project lead who is seeking to adopt a framework for building analytic applications in R.

The book also aims to lighten the education of the project lead and team members by concentrating attention on a few essential analytic application components, the R startup process, and procedures of reproducibility. The numbers of the sections may be used as references in code review and induction of team members.

# Load the package found at the project root (usethis::proj_get()) through
# pkgload, with `helpers` and `export_all` switched on.
withr::with_dir(
    usethis::proj_get(),
    pkgload::load_all(helpers = TRUE, export_all = TRUE)
)

# global options ----------------------------------------------------------
options(tidyverse.quiet = TRUE)

# bookdown ----------------------------------------------------------------
options(tinytex.verbose = TRUE)
# Hook stored under `bookdown.post.latex`: replaces every LaTeX line containing
# the 248-grey `shadecolor` definition with the 230-grey definition.
options(bookdown.post.latex = function(lines) {
    rep_pos <- grep("\\definecolor{shadecolor}{RGB}{248,248,248}", lines, fixed = TRUE)
    lines[rep_pos] <- "\\definecolor{shadecolor}{RGB}{230,230,230}"
    lines
})

# knitr -------------------------------------------------------------------
# Chunk defaults for the whole book. Each option is listed exactly once:
# `out.width` is "70%" (a second, conflicting `out.width` entry would only
# obscure which value is in effect).
knitr::opts_chunk$set(
    echo = FALSE,
    results = "markup",
    message = FALSE,
    warning = FALSE,
    cache = TRUE,
    comment = "#>",
    fig.retina = 0.8, # figures are either vectors or 300 dpi diagrams
    dpi = 300,
    out.width = "70%",
    fig.align = "center",
    fig.width = 6,
    fig.asp = 0.618,  # 1 / phi
    fig.show = "hold",
    eval.after = "fig.cap" # so captions can use link to demos
)

# index.R metadata --------------------------------------------------------
# `index` holds accessor functions for the book metadata; most of them read
# from the DESCRIPTION file at the project root.
index <- new.env()
DESCRIPTION <- desc::description$new(usethis::proj_get())
index$title       <- function() DESCRIPTION$get_field("Title")
index$subtitle    <- function() DESCRIPTION$get_field("Subtitle")
index$description <- function() DESCRIPTION$get_field("Description")
# Author name = "given family" taken from the DESCRIPTION author entry
index$author      <- function() paste(unlist(DESCRIPTION$get_author())[c("given", "family")], collapse = " ")
index$date        <- base::Sys.Date
index$url         <- function() DESCRIPTION$get_urls()
index$cover_image <- function() NULL # "images/cover.png"
index$favicon     <- function() "favicon.ico"
index$github_repo <- function() "Kiwi-Random-House/R-Projects"

Introduction {#intro}

Data scientists use R to develop analytic applications (or analytic apps for short), such as machine learning systems, dashboards and reports, to measure and improve the performance of a subject matter. If you are an experienced data scientist, then chances are you have already carried out several analytic projects that ended up with working analytic apps. Ask yourself, when the next analytic project commences, what would guide my development process? Am I able to outline my approach to others? Am I able to share my design principles (if any) with others?

Many data scientists have either faint or no answers to these questions. Nevertheless, they jump straight into coding the analytic app while skipping over its design. Perhaps in the absence of a repeatable approach, data scientists are looking to keep themselves busy by doing something they know, i.e. programming. However, busyness does not imply productivity. In fact, there is a hidden kind of danger in ignoring up-front design. The evolving nature of analytic projects needs a design that accommodates future changes driven by circumstances, clients' needs and data.

# Load the package found at the project root (usethis::proj_get()) through
# pkgload, with `helpers` and `export_all` switched on.
withr::with_dir(
    usethis::proj_get(),
    pkgload::load_all(helpers = TRUE, export_all = TRUE)
)

# global options ----------------------------------------------------------
options(tidyverse.quiet = TRUE)

# bookdown ----------------------------------------------------------------
options(tinytex.verbose = TRUE)
# Hook stored under `bookdown.post.latex`: replaces every LaTeX line containing
# the 248-grey `shadecolor` definition with the 230-grey definition.
options(bookdown.post.latex = function(lines) {
    rep_pos <- grep("\\definecolor{shadecolor}{RGB}{248,248,248}", lines, fixed = TRUE)
    lines[rep_pos] <- "\\definecolor{shadecolor}{RGB}{230,230,230}"
    lines
})

# knitr -------------------------------------------------------------------
# Chunk defaults for the whole book. Each option is listed exactly once:
# `out.width` is "70%" (a second, conflicting `out.width` entry would only
# obscure which value is in effect).
knitr::opts_chunk$set(
    echo = FALSE,
    results = "markup",
    message = FALSE,
    warning = FALSE,
    cache = TRUE,
    comment = "#>",
    fig.retina = 0.8, # figures are either vectors or 300 dpi diagrams
    dpi = 300,
    out.width = "70%",
    fig.align = "center",
    fig.width = 6,
    fig.asp = 0.618,  # 1 / phi
    fig.show = "hold",
    eval.after = "fig.cap" # so captions can use link to demos
)

# index.R metadata --------------------------------------------------------
# `index` holds accessor functions for the book metadata; most of them read
# from the DESCRIPTION file at the project root.
index <- new.env()
DESCRIPTION <- desc::description$new(usethis::proj_get())
index$title       <- function() DESCRIPTION$get_field("Title")
index$subtitle    <- function() DESCRIPTION$get_field("Subtitle")
index$description <- function() DESCRIPTION$get_field("Description")
# Author name = "given family" taken from the DESCRIPTION author entry
index$author      <- function() paste(unlist(DESCRIPTION$get_author())[c("given", "family")], collapse = " ")
index$date        <- base::Sys.Date
index$url         <- function() DESCRIPTION$get_urls()
index$cover_image <- function() NULL # "images/cover.png"
index$favicon     <- function() "favicon.ico"
index$github_repo <- function() "Kiwi-Random-House/R-Projects"

(PART*) Part I: Background {-}

# Load the package found at the project root (usethis::proj_get()) through
# pkgload, with `helpers` and `export_all` switched on.
withr::with_dir(
    usethis::proj_get(),
    pkgload::load_all(helpers = TRUE, export_all = TRUE)
)

# global options ----------------------------------------------------------
options(tidyverse.quiet = TRUE)

# bookdown ----------------------------------------------------------------
options(tinytex.verbose = TRUE)
# Hook stored under `bookdown.post.latex`: replaces every LaTeX line containing
# the 248-grey `shadecolor` definition with the 230-grey definition.
options(bookdown.post.latex = function(lines) {
    rep_pos <- grep("\\definecolor{shadecolor}{RGB}{248,248,248}", lines, fixed = TRUE)
    lines[rep_pos] <- "\\definecolor{shadecolor}{RGB}{230,230,230}"
    lines
})

# knitr -------------------------------------------------------------------
# Chunk defaults for the whole book. Each option is listed exactly once:
# `out.width` is "70%" (a second, conflicting `out.width` entry would only
# obscure which value is in effect).
knitr::opts_chunk$set(
    echo = FALSE,
    results = "markup",
    message = FALSE,
    warning = FALSE,
    cache = TRUE,
    comment = "#>",
    fig.retina = 0.8, # figures are either vectors or 300 dpi diagrams
    dpi = 300,
    out.width = "70%",
    fig.align = "center",
    fig.width = 6,
    fig.asp = 0.618,  # 1 / phi
    fig.show = "hold",
    eval.after = "fig.cap" # so captions can use link to demos
)

# index.R metadata --------------------------------------------------------
# `index` holds accessor functions for the book metadata; most of them read
# from the DESCRIPTION file at the project root.
index <- new.env()
DESCRIPTION <- desc::description$new(usethis::proj_get())
index$title       <- function() DESCRIPTION$get_field("Title")
index$subtitle    <- function() DESCRIPTION$get_field("Subtitle")
index$description <- function() DESCRIPTION$get_field("Description")
# Author name = "given family" taken from the DESCRIPTION author entry
index$author      <- function() paste(unlist(DESCRIPTION$get_author())[c("given", "family")], collapse = " ")
index$date        <- base::Sys.Date
index$url         <- function() DESCRIPTION$get_urls()
index$cover_image <- function() NULL # "images/cover.png"
index$favicon     <- function() "favicon.ico"
index$github_repo <- function() "Kiwi-Random-House/R-Projects"

A Running Example {#example}

We demonstrate the knowledge in this book by emulating an analytic project that includes the development and deployment of a machine learning system. This chapter presents the background, requirements, and deliverables of the analytic project.

The example is based on an entry-level Kaggle competition for predicting house prices[^kaggle-competition]. Using the same dataset, we build an end-to-end machine learning system that solves a pseudo-real-life problem. The project includes two parts: training a prediction model on historical data, and making predictions on unseen data.

# Figure 1 - Predict the house sale price
# Embeds the image file as-is; `dpi = NA` is passed explicitly rather than
# leaving knitr to fall back on the chunk-level dpi.
knitr::include_graphics(path = "./images/for-sale-ad.jpg", dpi = NA)

Originally, the house prices competition serves as a playground for data scientists to hone their skills. The goal of the competition is to predict sales prices for 1,459 houses. The competition features a dataset with 81 columns in which 73 are identifiable house attributes, termed amenities. Each participant submits, i.e., uploads to Kaggle, a table with 1,459 rows and two columns: Id and SalePrice. Kaggle evaluates the accuracy of each solution, based on RMSE, and ranks each participant on the leader-board in comparison to other competitors' submissions scores.

See the full dataset description in the appendix.

# Render the salient-amenities table through the `tables` helper object.
# NOTE(review): `tables` is not defined in this file; presumably it comes from
# the package loaded by pkgload::load_all() in the setup chunk. Confirm.
tables$report_salient_amenities()

To emulate a real-world analysis project, we transmute the competition setup to a business case setup. The deliverable of the business case is an automated valuation model (AVM). The AVM provides house price predictions for other real estate agencies' tools, such as a website or a real estate management system. Some major differences in needs between the original setting of the data science competition and the business case include:

Consider the following factors:

  1. The housing market changes over time; and
  2. The agency's database changes over time, e.g., new listings are added to the database.

Both of these factors require the analytic application to be re-runnable when the need arises. The first factor involves a trigger that periodically, say once a month, calls for a price update of all active listings. The second factor triggers a call when a new listing is to be added to the agency's database.

Similarly to the competition, the solution is iterative.

[^kaggle-competition]: You can read more about it on Kaggle.

# Load the package found at the project root (usethis::proj_get()) through
# pkgload, with `helpers` and `export_all` switched on.
withr::with_dir(
    usethis::proj_get(),
    pkgload::load_all(helpers = TRUE, export_all = TRUE)
)

# global options ----------------------------------------------------------
options(tidyverse.quiet = TRUE)

# bookdown ----------------------------------------------------------------
options(tinytex.verbose = TRUE)
# Hook stored under `bookdown.post.latex`: replaces every LaTeX line containing
# the 248-grey `shadecolor` definition with the 230-grey definition.
options(bookdown.post.latex = function(lines) {
    rep_pos <- grep("\\definecolor{shadecolor}{RGB}{248,248,248}", lines, fixed = TRUE)
    lines[rep_pos] <- "\\definecolor{shadecolor}{RGB}{230,230,230}"
    lines
})

# knitr -------------------------------------------------------------------
# Chunk defaults for the whole book. Each option is listed exactly once:
# `out.width` is "70%" (a second, conflicting `out.width` entry would only
# obscure which value is in effect).
knitr::opts_chunk$set(
    echo = FALSE,
    results = "markup",
    message = FALSE,
    warning = FALSE,
    cache = TRUE,
    comment = "#>",
    fig.retina = 0.8, # figures are either vectors or 300 dpi diagrams
    dpi = 300,
    out.width = "70%",
    fig.align = "center",
    fig.width = 6,
    fig.asp = 0.618,  # 1 / phi
    fig.show = "hold",
    eval.after = "fig.cap" # so captions can use link to demos
)

# index.R metadata --------------------------------------------------------
# `index` holds accessor functions for the book metadata; most of them read
# from the DESCRIPTION file at the project root.
index <- new.env()
DESCRIPTION <- desc::description$new(usethis::proj_get())
index$title       <- function() DESCRIPTION$get_field("Title")
index$subtitle    <- function() DESCRIPTION$get_field("Subtitle")
index$description <- function() DESCRIPTION$get_field("Description")
# Author name = "given family" taken from the DESCRIPTION author entry
index$author      <- function() paste(unlist(DESCRIPTION$get_author())[c("given", "family")], collapse = " ")
index$date        <- base::Sys.Date
index$url         <- function() DESCRIPTION$get_urls()
index$cover_image <- function() NULL # "images/cover.png"
index$favicon     <- function() "favicon.ico"
index$github_repo <- function() "Kiwi-Random-House/R-Projects"

Principles {#principles}

Introduction {-}

Encapsulation and Abstractions

Take a look at the following two main.R versions:

  1. Main application with low-level details
# main.R
## Load house prices data
## (load() restores objects under their saved names, so they are loaded into a
## scratch environment and only `train_set` is kept)
scratch_env <- new.env()
load(
    file = usethis::proj_path("data", "train_set", ext = "rda"),
    envir = scratch_env
)
data <- scratch_env[["train_set"]]
rm(scratch_env)

## Plot important amenities
## (two panels side by side: a scatter plot and a box plot)
par(mfrow = c(1, 2))
plot(data$mpg, data$cyl, type = "p")
boxplot(mpg ~ cyl, data = data)
  1. Main application with high-level abstractions
# main.R
# Same two steps as the low-level version, expressed as two function calls.
# NOTE(review): neither function is defined in this file; the later examples
# use `load_mtcars_data` / `plot_important_attributes`. Confirm intended names.
data <- load_house_prices_data()
plot_important_amenities(data)

Both code snippets have the same intent: they load the house prices dataset and provide plots for data exploration. Notice how much cognitive load the first snippet requires as the human brain compiles the code. The situation aggravates further if the reader is not familiar with the R syntax. In contrast, the second snippet hides the implementation details by wrapping the details in functions. The high-level abstractions communicate that there are two events happening in main.R: loading and plotting of data. As a result, the code is simpler to read and understand.

#' Load the mtcars dataset
#'
#' @return The `mtcars` data frame shipped with the datasets package.
load_mtcars_data <- function() {
    datasets::mtcars
}

Furthermore, the second snippet is easier to maintain and develop. These qualities are desirable in any software application. This is because software systems evolve as programmers acquire new knowledge and understanding of the problem the software is set to solve. Importantly, analytic applications are the result of scattershot and serendipitous explorations. As data scientists discover new findings and signals, they incorporate them in the analytic application. For example, the original implementation of plot_important_attributes is:

#' Plot the important attributes of the data
#'
#' Draws a scatter plot of `mpg` against `cyl` and a box plot of `mpg` by `cyl`
#' side by side on the current graphics device.
#'
#' @param data A data frame with `mpg` and `cyl` columns.
#' @return The value of the final `boxplot()` call.
plot_important_attributes <- function(data){
    # Switch to a 1 x 2 layout and restore the caller's layout on exit, so the
    # function does not leak a changed `mfrow` into later plots.
    old_par <- par(mfrow = c(1, 2))
    on.exit(par(old_par), add = TRUE)

    plot(data$mpg, data$cyl, type = "p")
    boxplot(mpg ~ cyl, data = data)
}

Imagine a data scientist discovers, whether by client feedback or other means, that there is another important attribute to include in the data analysis. Moreover, to reduce confusion, the data scientist decides to modify the plots' aesthetics such that they contain titles. Then, plot_important_attributes mutates to:

#' Plot the important attributes of the data (with titles)
#'
#' Draws three titled panels side by side on the current graphics device: two
#' scatter plots (`mpg` against `hp`, `mpg` against `cyl`) and a box plot of
#' `mpg` by `cyl`.
#'
#' @param data A data frame with `mpg`, `hp` and `cyl` columns.
#' @return The value of the final `boxplot()` call.
plot_important_attributes <- function(data){
    # Switch to a 1 x 3 layout and restore the caller's layout on exit, so the
    # function does not leak a changed `mfrow` into later plots.
    old_par <- par(mfrow = c(1, 3))
    on.exit(par(old_par), add = TRUE)

    plot(data$mpg, data$hp, type = "p", main = "MPG ~ Horsepower")
    plot(data$mpg, data$cyl, type = "p", main = "MPG ~ Cylinders")
    boxplot(mpg ~ cyl, data = data, main = "MPG ~ Cylinders")
}

With encapsulation, the data scientist was able to modify and extend the rendered plots without making any changes in main.R.

The Dependency Inversion Principle

Layering

Conclusion {-}

# Load the package found at the project root (usethis::proj_get()) through
# pkgload, with `helpers` and `export_all` switched on.
withr::with_dir(
    usethis::proj_get(),
    pkgload::load_all(helpers = TRUE, export_all = TRUE)
)

# global options ----------------------------------------------------------
options(tidyverse.quiet = TRUE)

# bookdown ----------------------------------------------------------------
options(tinytex.verbose = TRUE)
# Hook stored under `bookdown.post.latex`: replaces every LaTeX line containing
# the 248-grey `shadecolor` definition with the 230-grey definition.
options(bookdown.post.latex = function(lines) {
    rep_pos <- grep("\\definecolor{shadecolor}{RGB}{248,248,248}", lines, fixed = TRUE)
    lines[rep_pos] <- "\\definecolor{shadecolor}{RGB}{230,230,230}"
    lines
})

# knitr -------------------------------------------------------------------
# Chunk defaults for the whole book. Each option is listed exactly once:
# `out.width` is "70%" (a second, conflicting `out.width` entry would only
# obscure which value is in effect).
knitr::opts_chunk$set(
    echo = FALSE,
    results = "markup",
    message = FALSE,
    warning = FALSE,
    cache = TRUE,
    comment = "#>",
    fig.retina = 0.8, # figures are either vectors or 300 dpi diagrams
    dpi = 300,
    out.width = "70%",
    fig.align = "center",
    fig.width = 6,
    fig.asp = 0.618,  # 1 / phi
    fig.show = "hold",
    eval.after = "fig.cap" # so captions can use link to demos
)

# index.R metadata --------------------------------------------------------
# `index` holds accessor functions for the book metadata; most of them read
# from the DESCRIPTION file at the project root.
index <- new.env()
DESCRIPTION <- desc::description$new(usethis::proj_get())
index$title       <- function() DESCRIPTION$get_field("Title")
index$subtitle    <- function() DESCRIPTION$get_field("Subtitle")
index$description <- function() DESCRIPTION$get_field("Description")
# Author name = "given family" taken from the DESCRIPTION author entry
index$author      <- function() paste(unlist(DESCRIPTION$get_author())[c("given", "family")], collapse = " ")
index$date        <- base::Sys.Date
index$url         <- function() DESCRIPTION$get_urls()
index$cover_image <- function() NULL # "images/cover.png"
index$favicon     <- function() "favicon.ico"
index$github_repo <- function() "Kiwi-Random-House/R-Projects"

Data Sources {#data-sources}

Introduction

Any analytic project needs access to data. The location (electronic address) or mechanism from which data originates or can be obtained is called a data source. This definition suggests data sources vary in their origins and their storage mechanisms. First, data may originate organically, from measuring instruments that record real data, or be generated synthetically to fit a specific purpose. Second, data may be stored inside a database, a flat-file, a website, or an API.

The choice of what data source to use in our analytic application is taken with regards to two major qualities: accessibility and usefulness. Given an existing data source, accessibility is the degree of how easy it is to obtain (access to) it. Accessibility is mainly impacted by company policy (regulation, security protocols, etc.) and the availability and willingness of people we are dependent on, mainly database administrators and data engineers, to respond to our data access requests. Usefulness is the degree of having practical worth or applicability. Usefulness is determined by circumstances combined with the goal we are trying to achieve.

Among the two data source qualities: accessibility and usefulness, the latter should lead our choice of a data source. The following sections describe common goals and circumstances of analytic projects and propose appropriate data sources.

Data Source Usefulness

In chapter 1, we decomposed the process of building analytic applications into two realms: software development and data analytics. Following that decomposition, we argue that having two data sources, one for each realm is appropriate for many analytic projects. We begin by identifying the role of data in each realm. Then, we infer what are the desirable or necessary attributes of the data source in each realm. Finally, we demonstrate the proposed data sources in our running example.

We start with data analytics as it requires the least explanation. Data analytics is the science of analyzing data in order to draw conclusions from that information, i.e. actionable insights. Examples of data analytics activities include data exploration and (predictive) model selection. Obviously, when modelling the real world, there is no substitute for real data. Nevertheless, a representative sample of the data is sufficient and even preferable in many analytic projects.

In our context, a representative sample is a subset of a data source that seeks to accurately reflect the data source. Representative samples tend to contain a subspace of the domain or a snapshot of the domain at a given time. That means that if we turn the representative sample into actionable insights, then adding more spatial/temporal data to the representative sample would give similar insights.

Representative samples can be stored in flat files of modest sizes. The file size attribute is desirable. Loading gigabytes of information every time an R session begins is both a slow and a memory-intensive action. Moreover, applying transformations to the loaded data may result in copies that would consume more memory.


The following code demonstrates a synthetic data source implementation for the running example. Notice that the synthetic dataset captures our current knowledge of the house prices domain. It has an identifier column (Id), salient features (Bedrooms and Bathrooms) that must exist in the real application, and a target variable (SalePrice). The content of the dataset holds true information to the extent necessary for software development activities. For example, as you would expect, SalePrice is a numeric variable with non-negative values. However, there is no attempt (at this stage) to fake values resembling prices in the real housing market.

#' Generate a synthetic house prices dataset
#'
#' Values are random draws; they are not meant to resemble real market data.
#'
#' @param n Number of rows to generate (`n = 0` gives an empty table).
#' @return A tibble with `n` rows and the columns `Id` (zero-padded character
#'   identifiers of width 6), `Bedrooms` and `Bathrooms` (Poisson draws,
#'   lambda = 3) and `SalePrice` (uniform draws on [0, 1]).
generate_synthetic_data <- function(n){
    # seq_len() instead of 1:n, which would yield c(1, 0) for n = 0
    dummy_id <- function() stringr::str_pad(seq_len(n), width = 6, pad = "0")
    dummy_numeric <- function() runif(n = n, min = 0, max = 1)
    dummy_integer <- function() rpois(n = n, lambda = 3)

    # The tibble is the last expression (not an assignment), so the result is
    # returned visibly. Columns are built in this order, which fixes the order
    # of the random draws for a given seed.
    tibble::tibble(
        Id        = dummy_id(),
        Bedrooms  = dummy_integer(),
        Bathrooms = dummy_integer(),
        SalePrice = dummy_numeric()
    )
}

# Fix the seed so the preview is reproducible, then show the first rows of a
# 100-row synthetic dataset.
set.seed(1356)
head(generate_synthetic_data(n = 100))

Data Source Accessibility

The Common Case

Conclusions

# Load the package found at the project root (usethis::proj_get()) through
# pkgload, with `helpers` and `export_all` switched on.
withr::with_dir(
    usethis::proj_get(),
    pkgload::load_all(helpers = TRUE, export_all = TRUE)
)

# global options ----------------------------------------------------------
options(tidyverse.quiet = TRUE)

# bookdown ----------------------------------------------------------------
options(tinytex.verbose = TRUE)
# Hook stored under `bookdown.post.latex`: replaces every LaTeX line containing
# the 248-grey `shadecolor` definition with the 230-grey definition.
options(bookdown.post.latex = function(lines) {
    rep_pos <- grep("\\definecolor{shadecolor}{RGB}{248,248,248}", lines, fixed = TRUE)
    lines[rep_pos] <- "\\definecolor{shadecolor}{RGB}{230,230,230}"
    lines
})

# knitr -------------------------------------------------------------------
# Chunk defaults for the whole book. Each option is listed exactly once:
# `out.width` is "70%" (a second, conflicting `out.width` entry would only
# obscure which value is in effect).
knitr::opts_chunk$set(
    echo = FALSE,
    results = "markup",
    message = FALSE,
    warning = FALSE,
    cache = TRUE,
    comment = "#>",
    fig.retina = 0.8, # figures are either vectors or 300 dpi diagrams
    dpi = 300,
    out.width = "70%",
    fig.align = "center",
    fig.width = 6,
    fig.asp = 0.618,  # 1 / phi
    fig.show = "hold",
    eval.after = "fig.cap" # so captions can use link to demos
)

# index.R metadata --------------------------------------------------------
# `index` holds accessor functions for the book metadata; most of them read
# from the DESCRIPTION file at the project root.
index <- new.env()
DESCRIPTION <- desc::description$new(usethis::proj_get())
index$title       <- function() DESCRIPTION$get_field("Title")
index$subtitle    <- function() DESCRIPTION$get_field("Subtitle")
index$description <- function() DESCRIPTION$get_field("Description")
# Author name = "given family" taken from the DESCRIPTION author entry
index$author      <- function() paste(unlist(DESCRIPTION$get_author())[c("given", "family")], collapse = " ")
index$date        <- base::Sys.Date
index$url         <- function() DESCRIPTION$get_urls()
index$cover_image <- function() NULL # "images/cover.png"
index$favicon     <- function() "favicon.ico"
index$github_repo <- function() "Kiwi-Random-House/R-Projects"

(PART*) Part II: Design {-}

# Load the package found at the project root (usethis::proj_get()) through
# pkgload, with `helpers` and `export_all` switched on.
withr::with_dir(
    usethis::proj_get(),
    pkgload::load_all(helpers = TRUE, export_all = TRUE)
)

# global options ----------------------------------------------------------
options(tidyverse.quiet = TRUE)

# bookdown ----------------------------------------------------------------
options(tinytex.verbose = TRUE)
# Hook stored under `bookdown.post.latex`: replaces every LaTeX line containing
# the 248-grey `shadecolor` definition with the 230-grey definition.
options(bookdown.post.latex = function(lines) {
    rep_pos <- grep("\\definecolor{shadecolor}{RGB}{248,248,248}", lines, fixed = TRUE)
    lines[rep_pos] <- "\\definecolor{shadecolor}{RGB}{230,230,230}"
    lines
})

# knitr -------------------------------------------------------------------
# Chunk defaults for the whole book. Each option is listed exactly once:
# `out.width` is "70%" (a second, conflicting `out.width` entry would only
# obscure which value is in effect).
knitr::opts_chunk$set(
    echo = FALSE,
    results = "markup",
    message = FALSE,
    warning = FALSE,
    cache = TRUE,
    comment = "#>",
    fig.retina = 0.8, # figures are either vectors or 300 dpi diagrams
    dpi = 300,
    out.width = "70%",
    fig.align = "center",
    fig.width = 6,
    fig.asp = 0.618,  # 1 / phi
    fig.show = "hold",
    eval.after = "fig.cap" # so captions can use link to demos
)

# index.R metadata --------------------------------------------------------
# `index` holds accessor functions for the book metadata; most of them read
# from the DESCRIPTION file at the project root.
index <- new.env()
DESCRIPTION <- desc::description$new(usethis::proj_get())
index$title       <- function() DESCRIPTION$get_field("Title")
index$subtitle    <- function() DESCRIPTION$get_field("Subtitle")
index$description <- function() DESCRIPTION$get_field("Description")
# Author name = "given family" taken from the DESCRIPTION author entry
index$author      <- function() paste(unlist(DESCRIPTION$get_author())[c("given", "family")], collapse = " ")
index$date        <- base::Sys.Date
index$url         <- function() DESCRIPTION$get_urls()
index$cover_image <- function() NULL # "images/cover.png"
index$favicon     <- function() "favicon.ico"
index$github_repo <- function() "Kiwi-Random-House/R-Projects"

Domain Modelling {#domain-model}

# Load the package found at the project root (usethis::proj_get()) through
# pkgload, with `helpers` and `export_all` switched on.
withr::with_dir(
    usethis::proj_get(),
    pkgload::load_all(helpers = TRUE, export_all = TRUE)
)

# global options ----------------------------------------------------------
options(tidyverse.quiet = TRUE)

# bookdown ----------------------------------------------------------------
options(tinytex.verbose = TRUE)
# Hook stored under `bookdown.post.latex`: replaces every LaTeX line containing
# the 248-grey `shadecolor` definition with the 230-grey definition.
options(bookdown.post.latex = function(lines) {
    rep_pos <- grep("\\definecolor{shadecolor}{RGB}{248,248,248}", lines, fixed = TRUE)
    lines[rep_pos] <- "\\definecolor{shadecolor}{RGB}{230,230,230}"
    lines
})

# knitr -------------------------------------------------------------------
# Chunk defaults for the whole book. Each option is listed exactly once:
# `out.width` is "70%" (a second, conflicting `out.width` entry would only
# obscure which value is in effect).
knitr::opts_chunk$set(
    echo = FALSE,
    results = "markup",
    message = FALSE,
    warning = FALSE,
    cache = TRUE,
    comment = "#>",
    fig.retina = 0.8, # figures are either vectors or 300 dpi diagrams
    dpi = 300,
    out.width = "70%",
    fig.align = "center",
    fig.width = 6,
    fig.asp = 0.618,  # 1 / phi
    fig.show = "hold",
    eval.after = "fig.cap" # so captions can use link to demos
)

# index.R metadata --------------------------------------------------------
# `index` holds accessor functions for the book metadata; most of them read
# from the DESCRIPTION file at the project root.
index <- new.env()
DESCRIPTION <- desc::description$new(usethis::proj_get())
index$title       <- function() DESCRIPTION$get_field("Title")
index$subtitle    <- function() DESCRIPTION$get_field("Subtitle")
index$description <- function() DESCRIPTION$get_field("Description")
# Author name = "given family" taken from the DESCRIPTION author entry
index$author      <- function() paste(unlist(DESCRIPTION$get_author())[c("given", "family")], collapse = " ")
index$date        <- base::Sys.Date
index$url         <- function() DESCRIPTION$get_urls()
index$cover_image <- function() NULL # "images/cover.png"
index$favicon     <- function() "favicon.ico"
index$github_repo <- function() "Kiwi-Random-House/R-Projects"

Data Access Object {#data-access-object}

Import

The section uses the following packages:

# Packages used in this section, in sorted order.
pkgs_names <- sort(c("DBI", "dbplyr", "RSQLite"))
# NOTE(review): `bookdown` here is a project helper object, not the bookdown
# package namespace; it is not defined in this file. `install_string` is not
# used again in the visible code; presumably it feeds inline text. Confirm.
install_string <- bookdown$print_install.packages_command(pkgs_names)
bookdown$print_package_info_table(pkgs_names)

You can install them all at once by running:


There are three important functions:

# Local aliases for the three DBI functions used in this section, so they can
# be called without the `DBI::` prefix.
dbConnect <- DBI::dbConnect
dbWriteTable <- DBI::dbWriteTable
dbDisconnect <- DBI::dbDisconnect

Using generate_synthetic_data from chapter 4

# Open an in-memory SQLite database. RSQLite takes the database location via
# `dbname`; an unrecognised argument such as `path` is not used, which would
# leave `dbname` at its default instead of ":memory:".
conn <- dbConnect(drv = RSQLite::SQLite(), dbname = ":memory:")
# Populate the database with synthetic train and test tables.
dbWriteTable(conn, name = "train_set", value = generate_synthetic_data(n = 100))
dbWriteTable(conn, name = "test_set", value = generate_synthetic_data(n = 1000))
# Load the package found at the project root (usethis::proj_get()) through
# pkgload, with `helpers` and `export_all` switched on.
withr::with_dir(
    usethis::proj_get(),
    pkgload::load_all(helpers = TRUE, export_all = TRUE)
)

# global options ----------------------------------------------------------
options(tidyverse.quiet = TRUE)

# bookdown ----------------------------------------------------------------
options(tinytex.verbose = TRUE)
# Hook stored under `bookdown.post.latex`: replaces every LaTeX line containing
# the 248-grey `shadecolor` definition with the 230-grey definition.
options(bookdown.post.latex = function(lines) {
    rep_pos <- grep("\\definecolor{shadecolor}{RGB}{248,248,248}", lines, fixed = TRUE)
    lines[rep_pos] <- "\\definecolor{shadecolor}{RGB}{230,230,230}"
    lines
})

# knitr -------------------------------------------------------------------
# Chunk defaults for the whole book. Each option is listed exactly once:
# `out.width` is "70%" (a second, conflicting `out.width` entry would only
# obscure which value is in effect).
knitr::opts_chunk$set(
    echo = FALSE,
    results = "markup",
    message = FALSE,
    warning = FALSE,
    cache = TRUE,
    comment = "#>",
    fig.retina = 0.8, # figures are either vectors or 300 dpi diagrams
    dpi = 300,
    out.width = "70%",
    fig.align = "center",
    fig.width = 6,
    fig.asp = 0.618,  # 1 / phi
    fig.show = "hold",
    eval.after = "fig.cap" # so captions can use link to demos
)

# index.R metadata --------------------------------------------------------
# `index` holds accessor functions for the book metadata; most of them read
# from the DESCRIPTION file at the project root.
index <- new.env()
DESCRIPTION <- desc::description$new(usethis::proj_get())
index$title       <- function() DESCRIPTION$get_field("Title")
index$subtitle    <- function() DESCRIPTION$get_field("Subtitle")
index$description <- function() DESCRIPTION$get_field("Description")
# Author name = "given family" taken from the DESCRIPTION author entry
index$author      <- function() paste(unlist(DESCRIPTION$get_author())[c("given", "family")], collapse = " ")
index$date        <- base::Sys.Date
index$url         <- function() DESCRIPTION$get_urls()
index$cover_image <- function() NULL # "images/cover.png"
index$favicon     <- function() "favicon.ico"
index$github_repo <- function() "Kiwi-Random-House/R-Projects"

# (APPENDIX) Appendices {-}

# Chapter setup: load the project package, set global/bookdown/knitr options
# and define the `index` metadata accessors.

# Load the package from the project root, including its helpers and
# non-exported objects.
withr::with_dir(
    usethis::proj_get(),
    pkgload::load_all(helpers = TRUE, export_all = TRUE)
)

# global options ----------------------------------------------------------
options(tidyverse.quiet = TRUE)

# bookdown ----------------------------------------------------------------
options(tinytex.verbose = TRUE)
# Post-process the generated LaTeX lines: replace the `shadecolor` definition
# RGB(248,248,248) with the darker RGB(230,230,230). Lines that do not match
# are returned untouched.
options(bookdown.post.latex = function(lines) {
    rep_pos <- grep(
        "\\definecolor{shadecolor}{RGB}{248,248,248}",
        lines,
        fixed = TRUE
    )
    lines[rep_pos] <- "\\definecolor{shadecolor}{RGB}{230,230,230}"
    lines
})

# knitr -------------------------------------------------------------------
# `out.width` was previously listed twice ('100%' and "70%"). Only the last
# value took effect, so the dead '100%' entry has been removed.
knitr::opts_chunk$set(
    echo = FALSE,
    results = "markup",
    message = FALSE,
    warning = FALSE,
    cache = TRUE,
    comment = "#>",
    fig.retina = 0.8, # figures are either vectors or 300 dpi diagrams
    dpi = 300,
    out.width = "70%",
    fig.align = "center",
    fig.width = 6,
    fig.asp = 0.618,  # 1 / phi
    fig.show = "hold",
    eval.after = "fig.cap" # so captions can use link to demos
)

# index.R metadata --------------------------------------------------------
# Apart from `date`, each accessor is a zero-argument function, so the value
# is looked up when it is called.
index <- new.env()
DESCRIPTION <- desc::description$new(usethis::proj_get())
index$title       <- function() DESCRIPTION$get_field("Title")
index$subtitle    <- function() DESCRIPTION$get_field("Subtitle")
index$description <- function() DESCRIPTION$get_field("Description")
# Joins the 'given' and 'family' entries of the flattened author record.
index$author      <- function() {
    author_parts <- unlist(DESCRIPTION$get_author())[c("given", "family")]
    paste(author_parts, collapse = " ")
}
index$date        <- base::Sys.Date
index$url         <- function() DESCRIPTION$get_urls()
index$cover_image <- function() NULL # "images/cover.png"
index$favicon     <- function() "favicon.ico"
index$github_repo <- function() "Kiwi-Random-House/R-Projects"

# House Prices Data Description {#appendix-code-book}

# Reference the "code_book" table through the connection held by a fresh
# `HousePricesData` object.
code_book <- dplyr::tbl(
    HousePricesData$new()$con,
    "code_book"
)

# Pull the table's column with `dplyr::pull()`'s default selection and print
# one entry per line.
cat(
    stringr::str_c(dplyr::pull(code_book)),
    sep = "\n"
)
# Chapter setup: load the project package, set global/bookdown/knitr options
# and define the `index` metadata accessors.

# Load the package from the project root, including its helpers and
# non-exported objects.
withr::with_dir(
    usethis::proj_get(),
    pkgload::load_all(helpers = TRUE, export_all = TRUE)
)

# global options ----------------------------------------------------------
options(tidyverse.quiet = TRUE)

# bookdown ----------------------------------------------------------------
options(tinytex.verbose = TRUE)
# Post-process the generated LaTeX lines: replace the `shadecolor` definition
# RGB(248,248,248) with the darker RGB(230,230,230). Lines that do not match
# are returned untouched.
options(bookdown.post.latex = function(lines) {
    rep_pos <- grep(
        "\\definecolor{shadecolor}{RGB}{248,248,248}",
        lines,
        fixed = TRUE
    )
    lines[rep_pos] <- "\\definecolor{shadecolor}{RGB}{230,230,230}"
    lines
})

# knitr -------------------------------------------------------------------
# `out.width` was previously listed twice ('100%' and "70%"). Only the last
# value took effect, so the dead '100%' entry has been removed.
knitr::opts_chunk$set(
    echo = FALSE,
    results = "markup",
    message = FALSE,
    warning = FALSE,
    cache = TRUE,
    comment = "#>",
    fig.retina = 0.8, # figures are either vectors or 300 dpi diagrams
    dpi = 300,
    out.width = "70%",
    fig.align = "center",
    fig.width = 6,
    fig.asp = 0.618,  # 1 / phi
    fig.show = "hold",
    eval.after = "fig.cap" # so captions can use link to demos
)

# index.R metadata --------------------------------------------------------
# Apart from `date`, each accessor is a zero-argument function, so the value
# is looked up when it is called.
index <- new.env()
DESCRIPTION <- desc::description$new(usethis::proj_get())
index$title       <- function() DESCRIPTION$get_field("Title")
index$subtitle    <- function() DESCRIPTION$get_field("Subtitle")
index$description <- function() DESCRIPTION$get_field("Description")
# Joins the 'given' and 'family' entries of the flattened author record.
index$author      <- function() {
    author_parts <- unlist(DESCRIPTION$get_author())[c("given", "family")]
    paste(author_parts, collapse = " ")
}
index$date        <- base::Sys.Date
index$url         <- function() DESCRIPTION$get_urls()
index$cover_image <- function() NULL # "images/cover.png"
index$favicon     <- function() "favicon.ico"
index$github_repo <- function() "Kiwi-Random-House/R-Projects"

`r if (knitr::is_html_output()) '

# References {-}

'`



Kiwi-Random-House/R-Projects documentation built on Dec. 31, 2020, 2:10 p.m.