library(flexdashboard)
library(shiny)
library(tidyverse)
library(stringr)
library(ggthemes)
source('../model.R')
## Load & Score Test Data Set 
new_data <- read.table(
  cloudml::gs_data("gs://rstudio-cloudml-demo-ml/census/data/adult.test"),
  col.names = CSV_COLUMNS,
  header = FALSE,
  sep = ",",
  stringsAsFactors = FALSE
) 

# Clean Up Data
new_data$fnlwgt <- NULL
label <- new_data[[LABEL_COLUMN]]
new_data[[LABEL_COLUMN]] <- NULL

# generate predictions
predictions <- cloudml::local_predict("../jobs/local", new_data)

# flatten predictions
new_data[[LABEL_COLUMN]] <- label
new_data$score <- predictions$predictions %>% map_dbl(~ .x$probabilities[2])

Row {.sidebar}

Predict whether income exceeds \$50K/yr based on census data. Data extraction was done by Barry Becker from the 1994 Census database. Prediction task is to determine whether a person makes over 50K a year. See the data source and description for more information.


sliderInput("cutoff", "Probability Cutoff", min = 0, max = 1, value = 0.5)

Row {data-height=150}

Accuracy

valueBoxOutput("accuracy")

Confusion Matrix

tableOutput("conf")
labelled_data <- reactive({
  new_data$Label <- ifelse(new_data$score > input$cutoff, " >50K.", " <=50K.")
  new_data
})

output$accuracy <- renderValueBox({
  accuracy <- paste0(round(sum(labelled_data()$Label == labelled_data()$income_bracket) / nrow(labelled_data()), 2)*100, "%")
  valueBox(accuracy, caption = "accuracy", color = "primary", icon = "fa-check-circle")
})

output$conf <- renderTable(rownames = TRUE, digits = 0, {
  conf_matrix <- matrix(data = rep(0,4), nrow = 2, ncol = 2)
  conf_matrix[1,1] <- sum(labelled_data()$Label == " <=50K." & 
                          labelled_data()$income_bracket == " <=50K." )
  conf_matrix[1,2] <- sum(labelled_data()$Label == " >50K." & 
                          labelled_data()$income_bracket == " <=50K." )
  conf_matrix[2,1] <- sum(labelled_data()$Label == " <=50K." & 
                          labelled_data()$income_bracket == " >50K." )
  conf_matrix[2,2] <- sum(labelled_data()$Label == " >50K." & 
                          labelled_data()$income_bracket == " >50K." )
  colnames(conf_matrix) <- c("Predicted <=50K", ">50K")
  rownames(conf_matrix) <- c("Actual <=50K", ">50K")
  conf_matrix
})

Row {.tabset}

Gender

plotOutput("gender")
output$gender <- renderPlot({
  plotDiscrete("gender")
})

Age

plotOutput("age")
output$age <- renderPlot({
  plotContinuous("age")
})

Relationship

plotOutput("relationship")
output$relationship <- renderPlot({
  plotDiscrete("relationship")
})

Marital Status

plotOutput("marital_status")
output$marital_status <- renderPlot({
  plotDiscrete("marital_status")
})

Race

plotOutput("race")
output$race <- renderPlot({
  plotDiscrete("race")
})

Native Country

plotOutput("native_country")
output$native_country <- renderPlot({
  plotDiscrete("native_country")
})

Hours Per Week

plotOutput("hours_per_week")
output$hours_per_week <- renderPlot({
  plotContinuous("hours_per_week")
})

Occupation

plotOutput("occupation")
output$occupation <- renderPlot({
  plotDiscrete("occupation")
})

Occupation Class

plotOutput("workclass")
output$workclass <- renderPlot({
  plotDiscrete("workclass")
})

Education - Degree

plotOutput("education")
output$education <- renderPlot({
  plotDiscrete("education")
})

Education - Years

plotOutput("education_num")
output$education_num <- renderPlot({
  plotContinuous("education_num")
})
plotContinuous <- function(variable = "hours_per_week") {

  lab <- str_replace_all(variable, "_", " ") %>% str_to_title()
  ggplot(labelled_data()) +
    geom_density(aes_string(x = variable)) + 
    labs(
      title = lab,
      fill = ""
    ) +
    scale_y_continuous(label = function(x){paste0(x*100, "%")}) +
    theme_fivethirtyeight() +
    scale_fill_fivethirtyeight() +
    facet_wrap(~Label)

}


plotDiscrete <- function(variable = "gender") {
 lab <- str_replace_all(variable, "_", " ") %>% str_to_title()

  labelled_data() %>% 
    mutate_(var = variable) %>% 
    group_by(Label, var) %>%
    summarise(n = n()) %>%
    mutate(prop = n / sum(n)) %>% 
    ggplot() +
    geom_col(aes(x = reorder(var, prop), y = prop),
      position = 'dodge') +
    labs(
      title = lab,
      fill = ""
    ) +
    scale_y_continuous(label = function(x){paste0(x*100, "%")}) +
    theme_fivethirtyeight() +
    theme(axis.text.x = element_text(angle = 90, size = 14)) +
    scale_fill_fivethirtyeight() +
    facet_wrap(~Label)
}


rstudio/cloudml documentation built on Aug. 12, 2020, 8:22 p.m.