library(skimr)
require(tidyverse)
require(modeltime)
require(tidymodels)
library(workflowsets)
require(timetk)
require(xgboost)
require(caret)
library(lubridate)
library(devtools)
library(plotly)
library(openxlsx)
library(writexl)
require(randomForest)
require(shiny)
require(dendextend)
size = 2
# About App tab ---------------------------------------------------------------
Tab1 <- tabPanel("About the app", fluid = T,
sidebarLayout(position = 'left',
sidebarPanel(h3('Introduction'), width = size),
mainPanel(
h3('Welcome to the jungle'),
br(),
p('This application was developed in order to help you and your team the
process of selecting adequate predictors for your MMM model. This process
is thought to be an insight-recomendation generator for your linear model
and improve the understanding of your independent predictors.', style = "font-size:18px"),
br(),
p(strong('Run Forest Run'), 'is an useful graphic interface for creating
modelling rules, to facilitate the understanding of your newest model, but
it actually excels at comparing the updates vs the old model. It gives a
sense of entropy between the variables measured before the previous threshold
and the new data points.', style = "font-size:18px"),
br(),
p('To continue using the app please make sure that both of your datasets:', style = "font-size:18px"),
HTML(
'<ul>
<li style = "font-size:18px"; >Are Excel files (.xlsx).</li>
<li style = "font-size:18px"; >Does not contain any missing values.</li>
<li style = "font-size:18px"; >Its first row contains the variable names.</li>
<li style = "font-size:18px"; >Each of its columns is populated from the second row on.</li>
<li style = "font-size:18px"; >Contains a Date column that is in any date format.</li>
<li style = "font-size:18px"; >Both of your datasets have the exact same names.</li>
</ul>'),
h3('Example:'),
br(),
img(src='R\a.PNG',height=600,width=900),
br(),
br(),
br(),
p('If you have questions please contact David Carranza (Juan.Carranza@kinesso.com)', style="font-size:18px"),
p(uiOutput('github'), style="font-size:18px")
)))
# Upload tab --------------------------------------------------------------
Tab2 <- tabPanel("Upload Dataset", fluid = T,
sidebarLayout(position = 'left',
sidebarPanel(
br(),
h4("1. Choose your excel file"),
fileInput('file', 'Choose Excel File', accept = c('.xlsx')),
hr(),
h4('2. Define your segment'),
p('To continue, please select the variable name that defines your segments.'),
selectInput('cat','Segment Variable', '', selected=''),
p('To continue, please select a segment.'),
htmlOutput("variableUI"),
hr(),
h4("3. Select a new database for contrasting (optional)"),
fileInput('file_new', 'Choose New File', accept = c('.xlsx')),
width = size),
mainPanel(
br(),
h3('Reviewing your Dataset'),
br(),
dataTableOutput('glimpse'),
br(),
h3('A brief summary'),
br(),
dataTableOutput('summary'),
br(),
h3('Reviewing the new Dataset'),
br(),
dataTableOutput('glimpse_new'),
br(),
h3('This variables will not appear in your model'),
p("This happens because these variables are not present in one
of the datasets, it could be that in the new data we don't have
all the variables used in the last model, or because there are new
variables in this update. For clarification, please visit the",
strong('Data Review App'), style="font-size:18px"),
br(),
dataTableOutput('missing')
)))
# XGBoost tab -------------------------------------------------------
Tab3 <- tabPanel("XGBoost", fluid = T,
sidebarLayout(position = 'left',
sidebarPanel(
br(),
sliderInput(inputId = "seed_xg",
label = "Model and Grid Seed:",
min = 200,
max = 300,
val = 250),
sliderInput(inputId = "train_threshold_xg",
label = "Train Proportion:",
min = 0.7,
max = 0.9,
val = 0.8),
selectInput(inputId = "KPI_xg",
label = "Select your XG KPI",
"",
selected = ""),
selectInput(inputId = "drop_xg",
label = "Drop variables from the XG model:",
"",
multiple = T),
br(),
actionButton("update_drop_xg", "Run the XG Model"),
p('Once you have checked the modifiable parameters, press the button to
start the model'),
width = size),
mainPanel(
br(),
h3('Extreme Gradient Boost Model'),
p('This non-linear model is a set of decision trees. In particular, this model
starts with a base predictor (or base tree) and adds trees sequentially, i.e.
it adds one tree after another in order to adjust the errors of the predictor
model. The models are fitted by using any arbitrary differentiable loss function and
using the gradient boosting algorithm (hence the name', strong('gradient boosting'),').
Very similar to the optimisation process performed in neural networks.',
style="font-size:18px"),
br(),
p('The use of this application consists of evaluating the number of models defined
by the user (we strongly recommend a maximum of', strong('6 models'),'), giving as input the
dependent variable of interest and this will be adjusted against the other predictors.
The predictors may sometimes have correlations with each other, so the',
strong('correlation threshold slider'), 'defines the maximum tolerable linear
relationship between the predictors in the model. It is also possible to remove
variables that may generate noise, or that the user simply does not want. Once
these parameters have been configured, press the', strong("Run the model"),
'button to run the workflow.', style="font-size:18px"),
splitLayout(
sliderInput(inputId = "n_models_xg",
label = "Select number of models to test",
min = 2,
max = 8,
val = 4),
sliderInput(inputId = "thresh_corr_xg",
label = "Select correlation threshold",
min = 0.4,
max = 1,
val = 0.7)),
br(),
h3("Model Results"),
p('The', strong('Actual vs Predicted'), 'plot shows you the model result in contrast
with the real KPI variable, while the', strong('Variable Importance'), 'plot shows
the percentage of appearance of any variable. In other terms, in how many trees the
variables appear. So, if a variable has a huge impact in your model, it may be on top
of the chart.', style="font-size:18px"),
p(strong('Warning:'), 'In some cases, the', strong('Variable Importance'), 'plot may
not display, for solving this issue, slide the number of models once. If this error
persists, feel free to call me. This happens because the model does not converge, so
updating the parameters will solve this issue.', style="font-size:18px"),
plotlyOutput("xg_importances"),
plotlyOutput("xg_predictions"),
br(),
p('The model metrics and the error metrics are presented in the next two datatables.
The values are the final statistics. We usally use the', strong('MAPE'), 'the ',
strong("R-Squared"), '(presented in the model metrics table as "rsq") and most
recently the', strong('Durbin-Watson.'), style="font-size:18px"),
h3("Model Metrics"),
br(),
dataTableOutput("xg_m_metrics"),
br(),
h3("Error-Based Metrics"),
br(),
dataTableOutput("xg_e_metrics"),
br(),
h3("Best Model Configuration"),
p('This is the best performing-model. The parameters are explained below:',
style="font-size:18px"),
HTML(
'<ul>
<li style = "font-size:18px"; >Trees refers to the number of trees used for fitting the model.</li>
<li style = "font-size:18px"; >Min_n is the minimum number of observations taken for each model.</li>
<li style = "font-size:18px"; >Tree Depth is the max number of descending nodes for each model.</li>
<li style = "font-size:18px"; >Sample Size is the percentage of observations exposed for each model.</li>
<li style = "font-size:18px"; >Note: The number of features
taken for each model is equal to the square root of total features.</li>
</ul>'),
br(),
dataTableOutput("xg_configuration"),
br(),
h3('Out-Of-Sample predictions'),
br(),
p("In this section you can see the results of training the model with original data
and predicting the outcome with new data that the model hasn't seen. This is quite useful
because when we update the models, we could have a lot of problems with the new data points
and may waste a lot more of time. This can give you a better perspective about the new
information given by the client. If the model", strong("crashes"), "(the previous statistics
deteriorates a lot) the new data points could suggest a change in structure of the model,
suggesting that it may be necessary remodelling; if not, it could just be a routine update.
The blue vertical line shows the moment where the new data is taken",style="font-size:18px"),
br(),
plotlyOutput("xg_predictions_new"),
br(),
fluidRow(
column(
h3('Model Metrics'),
br(),
dataTableOutput("xg_new_m_metrics"),
br(),
width = 6,
),
column(
h3('Error Metrics'),
br(),
dataTableOutput("xg_new_e_metrics"),
br(),
width = 6
)
),
br(),
h3('Download your results'),
p('Are you satisfied with the model? Download it!', style="font-size:18px"),
downloadButton("download_xg_results", "Download Results")
)))
# Random Forest tab -------------------------------------------------------
Tab4 <- tabPanel("Random Forest", fluid = T,
sidebarLayout(position = 'left',
sidebarPanel(
br(),
sliderInput(inputId = "seed",
label = "Model and Grid Seed:",
min = 200,
max = 300,
val = 250),
sliderInput(inputId = "train_threshold",
label = "Train Proportion:",
min = 0.7,
max = 0.9,
val = 0.8),
selectInput(inputId = "KPI",
label = "Select your KPI",
"",
selected = ""),
selectInput(inputId = "drop",
label = "Drop variables from the model:",
"",
multiple = T),
br(),
actionButton("update_drop_rf", "Run the RF Model"),
p('Once you have checked the modifiable parameters, press the button to
start the model'),
width = size),
mainPanel(
br(),
h3('Random Forest Model'),
p("This non-linear model is a set of decision trees. In contrast to the previous model, this
model does not use the decision trees sequentially, but each of the trees operates independently.
Each gives an individual outcome and the model's outcome is then the top-ranked outcome
across all regression trees. By being independent, each model protects the other from
individual errors, improving aggregate performance.", style="font-size:18px"),
br(),
p('The use of this application consists of evaluating the number of models defined
by the user (we strongly recommend a maximum of', strong('8 models'),'), giving as input the
dependent variable of interest and this will be adjusted against the other predictors.
The predictors may sometimes have correlations with each other, so the',
strong('correlation threshold slider'), 'defines the maximum tolerable linear
relationship between the predictors in the model. It is also possible to remove
variables that may generate noise, or that the user simply does not want. Once
these parameters have been configured, press the', strong("Run the model"),
'button to run the workflow.', style="font-size:18px"),
br(),
splitLayout(
sliderInput(inputId = "n_models_rf",
label = "Select number of models to test",
min = 2,
max = 8,
val = 4),
sliderInput(inputId = "thresh_corr",
label = "Select correlation threshold",
min = 0.4,
max = 1,
val = 0.7)),
br(),
h3("Model Results"),
p('The', strong('Actual vs Predicted'), 'plot shows you the model result in contrast
with the real KPI variable, while the', strong('Variable Importance'), 'plot shows
the total number of appearance of any variable. In other terms, in how many trees the
variables appear, normalized into (0,1). So, if a variable has a huge impact in your model,
it may be on top of the chart. Like the previous model.', style="font-size:18px"),
br(),
plotlyOutput("rf_importances"),
plotlyOutput("rf_predictions"),
br(),
p('The model metrics and the error metrics are presented in the next two datatables.
The values are the final statistics. We usally use the', strong('MAPE'), 'the ',
strong("R-Squared"), '(presented in the model metrics table as "rsq") and most
recently the', strong('Durbin-Watson.'), style="font-size:18px"),
br(),
h3("Model Metrics"),
br(),
dataTableOutput("rf_m_metrics"),
br(),
h3("Error-Based Metrics"),
br(),
dataTableOutput("rf_e_metrics"),
br(),
h4("Best Model Configuration"),
p('This is the best performing-model. The parameters are explained below:',
style="font-size:18px"),
HTML(
'<ul>
<li style = "font-size:18px"; >Trees refers to the number of trees used for fitting the model.</li>
<li style = "font-size:18px"; >Min_n is the minimum number of observations taken for each model.</li>
</ul>'),
br(),
dataTableOutput("rf_configuration"),
br(),
h3('Out-Of-Sample predictions'),
br(),
p("In this section you can see the results of training the model with original data
and predicting the outcome with new data that the model hasn't seen. This is quite useful
because when we update the models, we could have a lot of problems with the new data points
and may waste a lot more of time. This can give you a better perspective about the new
information given by the client. If the model", strong("crashes"), "(the previous statistics
deteriorates a lot) the new data points could suggest a change in structure of the model,
suggesting that it may be necessary remodelling; if not, it could just be a routine update.
The blue vertical line shows the moment where the new data is taken",style="font-size:18px"),
br(),
plotlyOutput("rf_predictions_new"),
br(),
fluidRow(
column(
h3('Model Metrics'),
br(),
dataTableOutput("rf_new_m_metrics"),
br(),
width = 6,
),
column(
h3('Error-Based Metrics'),
br(),
dataTableOutput("rf_new_e_metrics"),
br(),
width = 6
)
),
br(),
h3('Download your results'),
p('Are you satisfied with the model? Download it!', style="font-size:18px"),
downloadButton("download_rf_results", "Download Results")
)))
Tab5 <- tabPanel("Hierarchy Trees", fluid = T,
sidebarLayout(position = "left",
sidebarPanel(
br(),
selectInput(inputId = "drop_ht",
label = "Drop variables for comparison:",
"",
multiple = T),
p("Drop the variables you don't want to see in the comparison tree.
You can drop as much as you want, the result is immediate."),
width = size),
mainPanel(
h3('Hierarchical Clustering'),
br(),
p('The idea behind this particular model, is the usage of some distance metric
for capturing the simmilarity between data points, in this case, the similarity
between variables. In this case, the technique is agglomerative, so each variable
is a cluster and the minor distance is represented by how close the clusters are
in the brances.',
style="font-size:18px"),
br(),
p("This app encourages you to use the methodology in order to detect in a visual
way the relations between variables (not only linear) and capture if some variables
are more or less related to each other. You can choose in the left panel the variables
you don't want to compare in the clustering; also you can choose the distance metric
used and the method for clusterization",
style="font-size:18px"),
br(),
h3('Select the parameters for your cluster'),
br(),
h4('Distances'),
br(),
HTML(
'<ul>
<li style = "font-size:15px"; >Euclidean distance is the ordinary distance between
two points in the plane, taking the sum of the difference
between the components.</li>
<li style = "font-size:15px"; >Maximum is the maximum distance beetween the components
of the points compared (supremum norm).</li>
<li style = "font-size:15px"; >Manhattan is the Absolute distance between the two vectors.</li>
<li style = "font-size:15px"; >Canberra is the absolute euclidian distance divided into the
sum of the components.</li>
<li style = "font-size:15px"; >Minkowski distance is the generalisation of the euclidean
and manhattan distance. It takes the absolute difference
between components to the p exponent depending on iterations.</li>
</ul>'),
br(),
h4('Clustering'),
br(),
HTML(
'<ul>
<li style = "font-size:15px"; >Ward uses the minimum variance method, in order to create links
that minimizes the intra-cluster variance.</li>
<li style = "font-size:15px"; >Single computes all pairwise dissimilarities and considers the smallest
one as a linkage criterion.</li>
<li style = "font-size:15px"; >Complete compares element to element to identify the maximum distance beetween
components and identify the minimum of these to create linkages.</li>
<li style = "font-size:15px"; >Average uses the mean of the pairwise dissimilarities between components
to create linkages, Median does the same operation.</li>
<li style = "font-size:15px"; >Centroids uses the average of a vector X to compare to the average of another
vector Y (of the same length) and creates linkages based on the minimum difference
of both.</li>
</ul>'),
br(),
splitLayout(
selectInput(inputId = "distances_1",
label = "Select the method for calculating distances.",
choices = c(
"Euclidean" = "euclidean",
"Maximum" = "maximum",
"Manhattan" = "manhattan",
"Canberra" = "canberra",
"Minkowski" = "minkowsky"),
selected = "euclidean"),
selectInput(inputId = "cluster_1",
label = "Select the method for clustering.",
choices = c(
"Ward" = "ward.D",
"Single" = "single",
"Complete" = "complete",
"Average" = "average",
"Median" = "median",
"Centroids" = "centroid"),
selected = "average")),
br(),
tags$head(tags$style(HTML(".shiny-split-layout > div {overflow: visible;}"))),
plotOutput('single_dendo', height = "900px"),
br(),
h3('Want to compare to another clustering type?'),
p('Sometimes you need to have a pair revision. In this case, you can compare
your first hierarchical cluster with another one and check if the variables
belong to the same end nodes. These can be compared with the inner links that
match the variables. The usage is the same, select the distance and the cluster
type',
style="font-size:18px"),
br(),
splitLayout(
selectInput(inputId = "distances_2",
label = "Select the method for calculating distances.",
choices = c(
"Euclidean" = "euclidean",
"Maximum" = "maximum",
"Manhattan" = "manhattan",
"Canberra" = "canberra",
"Binary" = "binary",
"Minkowski" = "minkowsky"),
selected = "euclidean"),
selectInput(inputId = "cluster_2",
label = "Select the method for clustering.",
choices = c(
"Ward" = "ward.D",
"Single" = "single",
"Complete" = "complete",
"Average" = "average",
"McQuitty" = "mcquitty",
"Median" = "median",
"Centroids" = "centroid"),
selected = "complete")),
br(),
plotOutput("tanglegram", height = "900px")
)))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.