# vignettes/mdb.R

## ----knitr_init, echo=FALSE, cache=FALSE, warning=FALSE, message
# Render knitr::kable() tables as HTML by default.
options(knitr.table.format = "html")
# Cap printed output at 100 entries, strongly prefer fixed over scientific
# notation, and use a very wide console so output is not wrapped.
options(max.print = 100, scipen = 999, width = 800)

# Default chunk options applied to every chunk of the vignette.
# NOTE(review): knitr documents `root.dir` as a package option
# (knitr::opts_knit), not a chunk option, so it presumably has no effect here.
# It is left in place because the "../" paths used later assume the working
# directory is the vignette directory -- confirm before moving it.
knitr::opts_chunk$set(
  echo = FALSE,
  cache = FALSE,
  prompt = FALSE,
  eval = TRUE,
  tidy = TRUE,
  root.dir = "..",
  fig.height = 8,
  fig.width = 20,
  comment = NA,
  message = FALSE,
  warning = FALSE
)

# Package-level knitr options. TRUE is spelled out because T is an ordinary,
# reassignable variable.
knitr::opts_knit$set(width = 100, figr.prefix = TRUE, figr.link = TRUE)

# Format inline R results with a comma as the thousands separator.
knitr::knit_hooks$set(inline = function(x) {
  prettyNum(x, big.mark = ",")
})

## ----load-libraries
library(dplyr)
library(extrafont)
library(ggplot2)

## ----source-functions
# Project helper scripts, sourced in this order into the global environment
# (source() defaults to local = FALSE).
helper_scripts <- NULL
invisible(lapply(
  c(
    "../R/analysis.R",
    "../R/associate.R",
    "../R/back.R",
    "../R/comparePredictions.R",
    "../R/conclusion.R",
    "../R/correlate.R",
    "../R/createDependent.R",
    "../R/forward.R",
    "../R/prediction.R",
    "../R/preprocess.R",
    "../R/process.R",
    "../R/regressionAnalysis.R",
    "../R/slr.R",
    "../R/univariate.R",
    "../R/univariateQual.R",
    "../R/univariateQuant.R",
    "../R/visualization.R"
  ),
  source
))
rm(helper_scripts)

## ----restore-data
# Import the two financial spreadsheets ...
fin1 <- openxlsx::read.xlsx("../data/sample.xlsx")
fin2 <- openxlsx::read.xlsx("../data/financials.xlsx")

# ... and cache each one in its own .Rdata file.
save(list = "fin1", file = "../data/fin1.Rdata")
save(list = "fin2", file = "../data/fin2.Rdata")

## ----load-data
# Flag set before loading; not read anywhere in the visible script.
rerun <- TRUE

# Restore the movie data and the cached financial tables into the workspace.
load(file = "../data/movies.Rdata")
load(file = "../data/fin1.Rdata")
load(file = "../data/fin2.Rdata")

## ----preprocess
# dataSets  <- preprocess(movies, fin1, fin2)
# train <- dataSets$train
# test <- dataSets$test
# case <- dataSets$case
# mdb2 <- dataSets$mdb2
# save(train, file = "../data/train.RData")
# save(test, file = "../data/test.RData")
# save(case, file = "../data/case.RData")
# save(mdb2, file = "../data/mdb2.RData")
# rerun <- TRUE

## ----univariate
# Restore the train / test / case / mdb2 objects from disk (presumably the
# outputs of the commented-out preprocess step -- confirm file-name casing).
load(file = "../data/train.Rdata")
load(file = "../data/test.Rdata")
load(file = "../data/case.Rdata")
load(file = "../data/mdb2.Rdata")

# Univariate analysis is run on the training set only.
edaUni <- univariate(mdb = train)

## ----models-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Read the model catalogue spreadsheet and render it as a styled HTML table.
models <- openxlsx::read.xlsx(xlsxFile = "../data/models.xlsx")
knitr::kable(models, digits = 2) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("hover", "condensed", "responsive"),
    full_width = TRUE, # TRUE rather than T: T can be reassigned
    position = "center"
  )

## ----fullModel
# Read the feature dictionary and keep the explanatory variables flagged "yes"
# in column `c`, then show their type, name and description.
# NOTE(review): a column literally named `c` shadows base::c inside filter();
# confirm the spreadsheet really has that column.
full <- openxlsx::read.xlsx("../data/features.xlsx")
full <- full %>%
  filter(c == "yes" & Context == "Explanatory") %>%
  select(Type, Variable, Description) %>%
  arrange(Type, Variable)
knitr::kable(full, digits = 2) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("hover", "condensed", "responsive"),
    full_width = TRUE, # TRUE rather than T: T can be reassigned
    position = "center"
  )

## ----model_a_build, results = "html
# Obtain the modelling data for the log IMDB votes response
mData <- process(train, stage = "m", y = "imdb_num_votes_log")

# Perform forward selection on the full data set
m <- forward(data = mData$full, y = "imdb_num_votes_log")

# Conduct regression analysis on the selected model ("Model Alpha")
modelA <- regressionAnalysis(
  mod = m, mName = "Model Alpha", yVar = "imdb_num_votes_log",
  yLab = "Log IMDB Votes"
)

# Report regression steps
knitr::kable(modelA$build, digits = 2) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("hover", "condensed", "responsive"),
    full_width = TRUE, # TRUE rather than T: T can be reassigned
    position = "center"
  )

## ----modelA_overview
# Summary statistics table for Model Alpha.
knitr::kable(modelA$glance, digits = 3) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("hover", "condensed", "responsive"),
    full_width = TRUE, # TRUE rather than T: T can be reassigned
    position = "center"
  )

## ----modelA_regression
# Auto-print the regression plot stored in the Model Alpha analysis results.
modelA$plots$regression

## ----modelA_linearity, fig.height=10------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Kept so that any later code relying on gridExtra being attached still works.
library(gridExtra)
# Arrange the Model Alpha linearity plots on a grid with floor(sqrt(n)) columns.
n <- length(modelA$plots$linearity)
nCol <- floor(sqrt(n))
# Call the namespaced function and pass the plots via `grobs =`: this does not
# depend on the search path, and list names cannot be mistaken for
# grid.arrange() arguments the way they can with do.call().
gridExtra::grid.arrange(grobs = modelA$plots$linearity, ncol = nCol)

## ----modelA_homoscedasticity
# Auto-print Model Alpha's `res_fitted` plot (presumably residuals vs. fitted
# values) for the homoscedasticity check.
modelA$plots$res_fitted

## ----modelA_residuals
# Model Alpha residual histogram and Q-Q plot, side by side.
gridExtra::grid.arrange(
  grobs = list(modelA$plots$res_hist, modelA$plots$res_qq),
  ncol = 2
)
                        


## ----modelA_multicollinearity
# Call the stored plot() function to draw Model Alpha's multicollinearity
# figure. NOTE(review): presumably a deferred base-graphics plot -- confirm in
# regressionAnalysis().
modelA$plots$multicollinearity$plot()

## ----modelA_vif
# Collinearity test results for Model Alpha as a styled HTML table.
knitr::kable(modelA$tests$collinearity, digits = 3) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("hover", "condensed", "responsive"),
    full_width = TRUE, # TRUE rather than T: T can be reassigned
    position = "center"
  )

## ----modelA_outliers
# Model Alpha `res_leverage` and `cooks` plots, side by side.
gridExtra::grid.arrange(
  grobs = list(modelA$plots$res_leverage, modelA$plots$cooks),
  ncol = 2
)

## ----model_b_build, results = "html
# Obtain the modelling data, passing Model Alpha's influential observations
# as `outliers`
mData <- process(train, stage = "m", "imdb_num_votes_log", outliers = modelA$tests$influential)

# Perform forward selection
m <- forward(mData$full, y = "imdb_num_votes_log")

# Conduct regression analysis on the selected model ("Model Beta")
modelB <- regressionAnalysis(
  m, mName = "Model Beta", yVar = "imdb_num_votes_log",
  yLab = "Log IMDB Votes"
)

# Report regression steps
knitr::kable(modelB$build, digits = 2) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("hover", "condensed", "responsive"),
    full_width = TRUE, # TRUE rather than T: T can be reassigned
    position = "center"
  )

## ----modelB_overview----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Summary statistics table for Model Beta.
knitr::kable(modelB$glance, digits = 3) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("hover", "condensed", "responsive"),
    full_width = TRUE, # TRUE rather than T: T can be reassigned
    position = "center"
  )

## ----modelB_regression
# Auto-print the regression plot stored in the Model Beta analysis results.
modelB$plots$regression

## ----modelB_linearity, fig.height
# Arrange the Model Beta linearity plots on a grid with floor(sqrt(n)) columns.
n <- length(modelB$plots$linearity)
nCol <- floor(sqrt(n))
# Call the namespaced function and pass the plots via `grobs =`: this does not
# depend on gridExtra having been attached by an earlier chunk, and list names
# cannot be mistaken for grid.arrange() arguments.
gridExtra::grid.arrange(grobs = modelB$plots$linearity, ncol = nCol)

## ----modelB_homoscedasticity
# Auto-print Model Beta's `res_fitted` plot (presumably residuals vs. fitted
# values) for the homoscedasticity check.
modelB$plots$res_fitted

## ----modelB_residuals
# Model Beta residual histogram and Q-Q plot, side by side.
gridExtra::grid.arrange(
  grobs = list(modelB$plots$res_hist, modelB$plots$res_qq),
  ncol = 2
)
                        


## ----modelB_multicollinearity
# Call the stored plot() function to draw Model Beta's multicollinearity
# figure. NOTE(review): presumably a deferred base-graphics plot -- confirm in
# regressionAnalysis().
modelB$plots$multicollinearity$plot()

## ----modelB_vif
# Collinearity test results for Model Beta as a styled HTML table.
knitr::kable(modelB$tests$collinearity, digits = 3) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("hover", "condensed", "responsive"),
    full_width = TRUE, # TRUE rather than T: T can be reassigned
    position = "center"
  )

## ----modelB_outliers
# Model Beta `res_leverage` and `cooks` plots, side by side.
gridExtra::grid.arrange(
  grobs = list(modelB$plots$res_leverage, modelB$plots$cooks),
  ncol = 2
)

## ----model_c_build, results = "html
# Obtain the modelling data for the log IMDB votes response
mData <- process(train, stage = "m", "imdb_num_votes_log")

# Select variables with back() (presumably backward elimination) at
# alpha = 0.05 -- this step is not forward selection
m <- back(mData$full, y = "imdb_num_votes_log", alpha = 0.05)

# Conduct regression analysis on the selected model ("Model Gamma")
modelC <- regressionAnalysis(
  m, mName = "Model Gamma", yVar = "imdb_num_votes_log",
  yLab = "Log IMDB Votes"
)

# Report regression steps
knitr::kable(modelC$build, digits = 2) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("hover", "condensed", "responsive"),
    full_width = TRUE, # TRUE rather than T: T can be reassigned
    position = "center"
  )

## ----modelC_variables
# Look up name and description of the variables listed in m$selected
# (the current selection result) in the feature dictionary.
vars <- openxlsx::read.xlsx(xlsxFile = "../data/features.xlsx")
vars <- vars %>%
  filter(Variable %in% m$selected) %>%
  select(Variable, Description)
knitr::kable(vars, digits = 3) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("hover", "condensed", "responsive"),
    full_width = TRUE, # TRUE rather than T: T can be reassigned
    position = "center"
  )

## ----modelC_overview
# Summary statistics table for Model Gamma.
knitr::kable(modelC$glance, digits = 3) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("hover", "condensed", "responsive"),
    full_width = TRUE, # TRUE rather than T: T can be reassigned
    position = "center"
  )

## ----modelC_regression
# Auto-print the regression plot stored in the Model Gamma analysis results.
modelC$plots$regression

## ----modelC_linearity, fig.height
# Arrange the Model Gamma linearity plots on a grid with floor(sqrt(n)) columns.
n <- length(modelC$plots$linearity)
nCol <- floor(sqrt(n))
# Call the namespaced function and pass the plots via `grobs =`: this does not
# depend on gridExtra having been attached by an earlier chunk, and list names
# cannot be mistaken for grid.arrange() arguments.
gridExtra::grid.arrange(grobs = modelC$plots$linearity, ncol = nCol)

## ----modelC_homoscedasticity
# Auto-print Model Gamma's `res_fitted` plot (presumably residuals vs. fitted
# values) for the homoscedasticity check.
modelC$plots$res_fitted

## ----modelC_residuals
# Model Gamma residual histogram and Q-Q plot, side by side.
gridExtra::grid.arrange(
  grobs = list(modelC$plots$res_hist, modelC$plots$res_qq),
  ncol = 2
)
                        


## ----modelC_multicollinearity
# Call the stored plot() function to draw Model Gamma's multicollinearity
# figure. NOTE(review): presumably a deferred base-graphics plot -- confirm in
# regressionAnalysis().
modelC$plots$multicollinearity$plot()

## ----modelC_vif---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Collinearity test results for Model Gamma as a styled HTML table.
knitr::kable(modelC$tests$collinearity, digits = 3) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("hover", "condensed", "responsive"),
    full_width = TRUE, # TRUE rather than T: T can be reassigned
    position = "center"
  )

## ----modelC_outliers
# Model Gamma `res_leverage` and `cooks` plots, side by side.
gridExtra::grid.arrange(
  grobs = list(modelC$plots$res_leverage, modelC$plots$cooks),
  ncol = 2
)

## ----model_d_build, results = "html
# Obtain the modelling data, passing Model Gamma's influential observations
# as `outliers`
mData <- process(train, stage = "m", "imdb_num_votes_log", outliers = modelC$tests$influential)

# Select variables with back() (presumably backward elimination) at
# alpha = 0.05 -- this step is not forward selection
m <- back(mData$full, y = "imdb_num_votes_log", alpha = 0.05)

# Conduct regression analysis on the selected model ("Model Delta")
modelD <- regressionAnalysis(
  m, mName = "Model Delta", yVar = "imdb_num_votes_log",
  yLab = "Log IMDB Votes"
)

# Report regression steps
knitr::kable(modelD$build, digits = 2) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("hover", "condensed", "responsive"),
    full_width = TRUE, # TRUE rather than T: T can be reassigned
    position = "center"
  )

## ----modelD_variables
# Look up name and description of the variables listed in m$selected
# (the current selection result) in the feature dictionary.
vars <- openxlsx::read.xlsx(xlsxFile = "../data/features.xlsx")
vars <- vars %>%
  filter(Variable %in% m$selected) %>%
  select(Variable, Description)
knitr::kable(vars, digits = 3) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("hover", "condensed", "responsive"),
    full_width = TRUE, # TRUE rather than T: T can be reassigned
    position = "center"
  )

## ----modelD_overview
# Summary statistics table for Model Delta.
knitr::kable(modelD$glance, digits = 3) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("hover", "condensed", "responsive"),
    full_width = TRUE, # TRUE rather than T: T can be reassigned
    position = "center"
  )

## ----modelD_regression
# Auto-print the regression plot stored in the Model Delta analysis results.
modelD$plots$regression

## ----modelD_linearity, fig.height
# Arrange the Model Delta linearity plots on a grid with floor(sqrt(n)) columns.
n <- length(modelD$plots$linearity)
nCol <- floor(sqrt(n))
# Call the namespaced function and pass the plots via `grobs =`: this does not
# depend on gridExtra having been attached by an earlier chunk, and list names
# cannot be mistaken for grid.arrange() arguments.
gridExtra::grid.arrange(grobs = modelD$plots$linearity, ncol = nCol)

## ----modelD_homoscedasticity
# Auto-print Model Delta's `res_fitted` plot (presumably residuals vs. fitted
# values) for the homoscedasticity check.
modelD$plots$res_fitted

## ----modelD_residuals
# Model Delta residual histogram and Q-Q plot, side by side.
gridExtra::grid.arrange(
  grobs = list(modelD$plots$res_hist, modelD$plots$res_qq),
  ncol = 2
)
                        


## ----modelD_multicollinearity
# Call the stored plot() function to draw Model Delta's multicollinearity
# figure. NOTE(review): presumably a deferred base-graphics plot -- confirm in
# regressionAnalysis().
modelD$plots$multicollinearity$plot()

## ----modelD_vif
# Collinearity test results for Model Delta as a styled HTML table.
knitr::kable(modelD$tests$collinearity, digits = 3) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("hover", "condensed", "responsive"),
    full_width = TRUE, # TRUE rather than T: T can be reassigned
    position = "center"
  )

## ----modelD_outliers
# Model Delta `res_leverage` and `cooks` plots, side by side.
gridExtra::grid.arrange(
  grobs = list(modelD$plots$res_leverage, modelD$plots$cooks),
  ncol = 2
)

## ----model_comparison---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Compare models: stack the summary rows of the four models into one table
abcd <- rbind(modelA$glance, modelB$glance, modelC$glance, modelD$glance)
knitr::kable(abcd, digits = 3) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("hover", "condensed", "responsive"),
    full_width = TRUE, # TRUE rather than T: T can be reassigned
    position = "center"
  )

## ----prediction_comparison
# Pass all four models and the held-out test set to comparePredictions() and
# tabulate the result.
mods <- list(modelA, modelB, modelC, modelD)
accuracy <- comparePredictions(mods = mods, test = test)
knitr::kable(accuracy, digits = 3) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("hover", "condensed", "responsive"),
    full_width = TRUE, # TRUE rather than T: T can be reassigned
    position = "center"
  )
# DataScienceSalon/mdb documentation built on May 28, 2019, 12:23 p.m.