Data from Department of Health and Mental Hygiene (DOHMH) - New York City Restaurant Inspection Results, and the link is here: https://data.cityofnewyork.us/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/43nn-pn8j.

knitr::opts_chunk$set(echo = TRUE)

Clean up the Data

library(readr)
library(dplyr)
library(stringr)

df <- read.csv('raw_data/coffee_compare.csv')
coffee <- df %>% select(DBA, reinspections, checks, violations, 
                        score, inspections, BORO, SCORE)
coffee$DBA = ifelse(str_detect(coffee$DBA, "DUNKIN"), 'DD', 'Starbucks')
coffee %>% 
    select(BORO, DBA) %>% 
    group_by(BORO) %>% 
    table()

Visualization

library(ggthemes)
library(ggplot2)
library(plotly)

coffee_new <- coffee %>% 
    group_by(DBA, BORO) %>% 
    summarize(Value = n())

pc <- ggplot(coffee_new, aes(fill=DBA, y=Value, x=BORO)) + 
    geom_bar(position="dodge", stat="identity", width = 0.5) +
    xlab('Neighborhood') + 
    ylab('Health Violations') + 
    labs(caption = 'Data Source: DOHMH',
         fill = 'Brands') +
    ggtitle('Health Violations of Coffee Brands by Neighborhoods') +
    theme_bw() +
    scale_fill_manual(values = c("#f09a56", "#87dc97")) +
    theme(plot.title = element_text(size=12, face="bold", hjust = 0.5),
          legend.text = element_text(size=8),
          legend.title = element_text(size=8))
pc
pc1 <- ggplot(coffee, aes(x = BORO, y = violations, color = DBA)) + 
    geom_point(alpha = 0.5) +
    xlab('Neighborhood') + 
    ylab('Health Violations') + 
    labs(caption = 'DOHMH') +
    ggtitle('Health Violations of Coffee Brands by Neighborhoods') +
    theme_bw() +
    theme(plot.title = element_text(size=12, face="bold", hjust = 0.5),
          legend.text = element_text(size=8),
          legend.title = element_text(size=8)) +
    scale_color_manual(values = c("DD" = "#f09a56", 
                                  'Starbucks' = "#87dc97"))
pc1
coffee_new <- coffee %>% 
    group_by(DBA, BORO) %>% 
    summarize(total_vio = sum(violations, na.rm = T))

pc11 <- ggplot(coffee_new, 
               aes(fill=DBA, y=total_vio, x=BORO)) + 
    geom_bar(position="dodge", stat="identity", width = 0.5) +
    xlab('Neighborhood') + 
    ylab('Health Violations') + 
    labs(caption = 'Data Source: DOHMH',
         fill = 'Brands') +
    ggtitle('Health Violations of Coffee Brands by Neighborhoods') +
    theme_bw() +
    scale_fill_manual(values = c("#f09a56", "#87dc97")) +
    theme(plot.title = element_text(size=12, face="bold", hjust = 0.5),
          legend.text = element_text(size=8),
          legend.title = element_text(size=8))
pc11
pc2 <- ggplot(coffee, aes(x = BORO, y = SCORE, color = DBA)) + 
    geom_point(alpha = 0.5) +
    xlab('Neighborhood') + 
    ylab('Total Score for a Particular Inspection') + 
    labs(caption = 'DOHMH') +
    ggtitle('Score of Coffee Brands by Neighborhoods') +
    theme_bw() +
    theme(plot.title = element_text(size=12, face="bold", hjust = 0.5),
          legend.text = element_text(size=8),
          legend.title = element_text(size=8)) +
    scale_color_manual(values = c("DD" = "#f09a56", 
                                  'Starbucks' = "#87dc97"))
pc2
coffee_new <- coffee %>% 
    group_by(DBA, BORO) %>% 
    summarize(total_score = sum(SCORE, na.rm = T))

pc11 <- ggplot(coffee_new, 
               aes(fill=DBA, y=total_score, x=BORO)) + 
    geom_bar(position="dodge", stat="identity", width = 0.5) +
    xlab('Neighborhood') + 
    ylab('Total Score for a Particular Inspection') + 
    labs(caption = 'Data Source: DOHMH',
         fill = 'Brands') +
    ggtitle('Inspection Score of Coffee Brands by Neighborhoods') +
    theme_bw() +
    scale_fill_manual(values = c("#f09a56", "#87dc97")) +
    theme(plot.title = element_text(size=12, face="bold", hjust = 0.5),
          legend.text = element_text(size=8),
          legend.title = element_text(size=8))
pc11

Linear Regression Analysis

library(jtools)
coffee_lm <- coffee %>% 
    group_by(DBA, BORO) %>% 
    summarize(total_num = n(),
              total_vio = sum(violations, na.rm = T),
              total_score = sum(SCORE, na.rm = T))
coffee_lm$DBA = ifelse(coffee_lm$DBA == "DD", 1, 0)

coffee_sb <- coffee_lm %>% filter(DBA == 1)
coffee_dd <- coffee_lm %>% filter(DBA == 0)
lm <- lm(total_num ~ total_vio + total_score,
         data = coffee_sb)
summary(lm)
summ(lm)
lm <- lm(total_num ~ total_vio + total_score,
         data = coffee_dd)
summary(lm)
summ(lm)
lm1 <- lm(total_vio ~ total_num,
         data = coffee_sb)
summary(lm1)
summ(lm)
lm1 <- lm(total_vio ~ total_num,
         data = coffee_dd)
summary(lm1)
summ(lm1)
lm2 <- lm(total_vio ~ total_num*DBA,
         data = coffee_lm)
summary(lm2)
summ(lm2)
lm2 <- lm(total_score ~ total_num*DBA,
         data = coffee_lm)
summary(lm2)
summ(lm2)

Supervised Machine Learning

Question: Whether can be told the coffee store is Starbucks or not?

Binary outcome in this case.

library(caret)

coffee$DBA = ifelse(coffee$DBA == "DD", 0, 1)
coffee$DBA <- factor(coffee$DBA, 
                     labels = c("Starbucks", "DD"), 
                     levels = 1:0) 
set.seed(12345)
in_train <- createDataPartition(y = coffee$DBA, 
                                p = 0.8, list = FALSE)
training <- coffee[ in_train, ]
testing  <- coffee[-in_train, ]

logit

logit <- glm(DBA ~ checks + violations + score + BORO, 
             data = training, family = binomial(link = "logit"))

y_hat_logit <- predict(logit, newdata = testing, type = "response")
z_logit <- factor(y_hat_logit > 0.5, 
                  levels = c(TRUE, FALSE), 
                  labels = c("Starbucks", "DD"))

confusionMatrix(z_logit, reference = testing$DBA)

LDA

LDA <- train(DBA ~ checks + violations + score + BORO, 
             data = training, method = "lda", 
             reProcess = c("center", "scale"))
z <- predict(LDA, newdata = testing)
confusionMatrix(z, testing$DBA)

QDA

QDA <- train(DBA ~ checks + violations + score + BORO, 
             data = training, method = "qda", 
             preProcess = c("center", "scale"))

z_QDA <- predict(QDA, newdata = testing)

confusionMatrix(z_QDA, reference = testing$DBA)

PLSDA

ctrl_PLSDA <- trainControl(method = "repeatedcv", repeats = 2, 
                     classProbs = TRUE, summaryFunction = twoClassSummary)
PLSDA_grid <- expand.grid(.ncomp = 1:2)

PLSDA <- train(DBA ~ checks + violations + score + BORO, 
               data = training, method = "pls", 
               preProcess = c("center", "scale"), metric = "ROC", 
               trControl = ctrl_PLSDA, tuneGrid = PLSDA_grid)

z_PLSDA <- predict(PLSDA, newdata = testing)

confusionMatrix(z_PLSDA, reference = testing$DBA)

Nearest Shrunken Centroids

ctrl_NSC <- trainControl(method = "repeatedcv", repeats = 3, 
                     classProbs = TRUE, summaryFunction = twoClassSummary)
grid_NSC <- data.frame(.threshold = 0:25)

NSC <- train(DBA ~ checks + violations + score + BORO, 
             data = training, method = "pam", 
             preProcess = c("center", "scale"), metric = "ROC", 
             trControl = ctrl_NSC, tuneGrid = grid_NSC)

z_NSC <- predict(NSC, newdata = testing) 
confusionMatrix(z_NSC, reference = testing$DBA)

Boosting

gbmControl = trainControl(method="cv", 
                          number=5, returnResamp = "all")

gbm = train(DBA ~ checks + violations + score + BORO,
            data=training, method="gbm",
            distribution="bernoulli", 
            trControl=gbmControl, verbose=F, 
            tuneGrid=data.frame(.n.trees=seq(100, 1000, 
                                              by = 100), 
                                 .shrinkage=c(0.01, 0.1), 
                                 .interaction.depth=1:2, 
                                 .n.minobsinnode=1:5),
             na.action = na.omit)

y_hat_gbm <- predict(gbm, newdata = testing, na.action = na.pass)

confusionMatrix(y_hat_gbm, reference = testing$DBA)

Random Forest

library(randomForest)
rf<- randomForest(DBA ~ checks + violations + score + BORO, 
                  data=training,
                  importance = TRUE,
                  na.action = na.omit)

y_hat_rf <- predict(rf, newdata = testing,
                 type = "response", na.action = na.pass)

confusionMatrix(y_hat_rf, reference = testing$DBA)

Neural Network Model

nnetGrid <- expand.grid(.decay = c(0, 0.01, .1),
                        .size = c(1:10))
ctrl_nn <- trainControl(method = "cv", number = 10)

nn <- train(DBA ~ checks + violations + score + as.factor(BORO), 
            data = training, method = "nnet",
            trControl = ctrl_nn, tuneGrid = nnetGrid,
            preProcess = c("center", "scale"), trace = FALSE)

y_hat_nn <- predict(nn, newdata = testing)

confusionMatrix(y_hat_nn, reference = testing$DBA)

MARS model

ctrl_mars <- trainControl(method = "cv", number = 10)
marsGrid <- expand.grid(.degree = 1:3, .nprune = 1:10)

mars <- train(DBA ~ checks + violations + score + as.factor(BORO),  
              data = training, method = "earth", 
              trControl = ctrl_mars, tuneGrid = marsGrid)

y_hat_mars <- predict(mars, newdata = testing)
confusionMatrix(y_hat_mars, reference = testing$DBA)


nikkyxiong/coffee_rating_and_nyc_demographics documentation built on May 16, 2020, 9:27 a.m.