R/source.R

install.packages("jsonlite")
library(jsonlite)
install.packages("rapportools")
library(rapportools)
install.packages("tidyverse")
library(tidyverse)
install.packages("dplyr")
library(dplyr)
install.packages("tm")
library(tm)
install.packages("fastDummies")
library(fastDummies)
install.packages("caret")
library(caret)
install.packages("kernlab")
library(kernlab)
install.packages("e1071")
library(e1071)
install.packages("arules")
library(arules)
install.packages("arulesViz")
library(arulesViz)
install.packages("MLmetrics")
library(MLmetrics)

setwd("C:/Data/study/IST-687/Project/ist687") # set this to the location where the json file is stored
source("munging.R") # set this to the location of the munging.R file

####### association rules mining on all data #######

df = jsonlite::fromJSON("fall2019-survey-M02.json")
dfBinnedData = getBinnedData(df)
dfTnx = as(dfBinnedData, "transactions")

##### association rules #####
rulesetPromoters <- apriori(dfTnx,
                            parameter=list(support=0.005,confidence=0.5),
                            appearance = list(default="lhs", rhs=("Likelihood.to.recommend=Promoter")))

rulesetDetractors <- apriori(dfTnx,
                             parameter=list(support=0.05,confidence=0.8),
                             appearance = list(default="lhs", rhs=("Likelihood.to.recommend=Detractor")))

summary(quality(rulesetPromoters)$lift)
summary(quality(rulesetDetractors)$lift)
length(rulesetPromoters[quality(rulesetPromoters)$lift > 2.35])
length(rulesetDetractors[quality(rulesetDetractors)$lift > 1.99])
arules::inspect(rulesetPromoters[quality(rulesetPromoters)$lift > 2.35])
arules::inspect(rulesetDetractors[quality(rulesetDetractors)$lift > 1.99])

inspectDT(rulesetPromoters[quality(rulesetPromoters)$lift > 2.35])
inspectDT(rulesetDetractors[quality(rulesetDetractors)$lift > 1.99])

# The columns with strong association were considered for classification modeling

##### classification models for all data #####
analysisColumns = c("Airline.Status", "Type.of.Travel", "Eating.and.Drinking.at.Airport", "Departure.Delay.in.Minutes", "Flights.Per.Year", "Price.Sensitivity", "olong", "dlat", "Total.Freq.Flyer.Accts")
analysisData = dfBinnedData[, names(dfBinnedData) %in% c(analysisColumns, "Likelihood.to.recommend") ]
analysisData$Likelihood.to.recommend[is.na(analysisData$Likelihood.to.recommend)] = "Passive"

analysisData = prepareForAnalysis(analysisData, analysisColumns)

fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)

set.seed(1)
inTraining = createDataPartition(analysisData$Likelihood.to.recommend, p = 0.75, list = FALSE)
trainData = analysisData[inTraining,]
testData = analysisData[-inTraining,]

logitBoost <- train(factor(Likelihood.to.recommend) ~., data = trainData, method = "LogitBoost", trControl=fitControl, preProcess = c("center", "scale"), tuneLength = 10)
logitBoost
# train accuracy - 79.55%

plot(logitBoost)

result = predict(logitBoost, testData)
sum(result == testData$Likelihood.to.recommend)/length(testData$Likelihood.to.recommend)
# test accuracy - 80.48%

F1_Score(testData$Likelihood.to.recommend, result)
# F1 score - 87.67%

varImp(logitBoost)

personalTravelResult = predict(logitBoost, testData[testData$`Type.of.Travel_Personal Travel` == 1,])
sum(personalTravelResult == "Detractor")/nrow(testData[testData$`Type.of.Travel_Personal Travel` == 1,])
# Inference - Personal travel customers have a strong relationship with their ratings.


###### association rules mining on Personal Travel data #######

personalTravel = df[str_trim(df$Type.of.Travel) == "Personal Travel", names(df) != "Type.of.Travel"]
personalTravelBinned = getBinnedData(personalTravel)
personalTravelTnx = as(personalTravelBinned, "transactions")

rulesetPromoters <- apriori(personalTravelTnx,
                            parameter=list(support=0.005,confidence=0.5),
                            appearance = list(default="lhs", rhs=("Likelihood.to.recommend=Promoter")))

rulesetDetractors <- apriori(personalTravelTnx,
                             parameter=list(support=0.05,confidence=0.8),
                             appearance = list(default="lhs", rhs=("Likelihood.to.recommend=Detractor")))

summary(quality(rulesetPromoters)$lift)
summary(quality(rulesetDetractors)$lift)
length(rulesetDetractors[quality(rulesetDetractors)$lift > 1.178])
arules::inspect(rulesetPromoters)
arules::inspect(rulesetDetractors[quality(rulesetDetractors)$lift > 1.178])

inspectDT(rulesetPromoters)
inspectDT(rulesetDetractors[quality(rulesetDetractors)$lift > 1.178])

# The information was already available.
# Trying to get a better model with Airline.Status == "Blue" since this has the bulk of the data (2504/3212).

personalBlue = df[str_trim(df$Type.of.Travel) == "Personal Travel" & str_trim(df$Airline.Status) == "Blue", !names(df) %in% c("Type.of.Travel", "Airline.Status")]
personalBlueBinned = getBinnedData(personalBlue)
personalBlueTnx = as(personalBlueBinned, "transactions")

rulesetPromoters <- apriori(personalBlueTnx,
                            parameter=list(support=0.002,confidence=0.5),
                            appearance = list(default="lhs", rhs=("Likelihood.to.recommend=Promoter")))

rulesetDetractors <- apriori(personalBlueTnx,
                             parameter=list(support=0.05,confidence=0.8),
                             appearance = list(default="lhs", rhs=("Likelihood.to.recommend=Detractor")))

summary(quality(rulesetPromoters)$lift)
summary(quality(rulesetDetractors)$lift)
length(rulesetPromoters[quality(rulesetPromoters)$lift > 28])
length(rulesetDetractors[quality(rulesetDetractors)$lift > 1.11])
arules::inspect(rulesetPromoters[quality(rulesetPromoters)$lift > 28])
arules::inspect(rulesetDetractors[quality(rulesetDetractors)$lift > 1.11])

inspectDT(rulesetPromoters[quality(rulesetPromoters)$lift > 28])
inspectDT(rulesetDetractors[quality(rulesetDetractors)$lift > 1.11])

# The columns with strong association were considered for classification modeling.

##### classification models for Personal Travel, Blue data #####
analysisColumns1 = c("Age", "Gender", "Flight.Distance", "Eating.and.Drinking.at.Airport",
                     "olong", "Arrival.Delay.in.Minutes", "Loyalty", "Total.Freq.Flyer.Accts")
analysisData1 = personalBlueBinned[, names(personalBlueBinned) %in% c(analysisColumns1, "Likelihood.to.recommend")]
analysisData1 = prepareForAnalysis(analysisData1, analysisColumns1)

set.seed(1)
inTraining1 = createDataPartition(analysisData1$Likelihood.to.recommend, p = .75, list = FALSE)
trainData1 = analysisData1[inTraining1,]
testData1 = analysisData1[-inTraining1,]

fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)

logitBoost1 <- train(factor(Likelihood.to.recommend) ~., data = trainData1, method = "LogitBoost", trControl=fitControl, preProcess = c("center", "scale"), tuneLength = 10)
logitBoost1
# train accuracy - 89.84%

plot(logitBoost1)

result5 = predict(logitBoost1, newdata = testData1)
sum(result5 == testData1$Likelihood.to.recommend)/length(testData1$Likelihood.to.recommend)
# test accuracy - 89.76%

F1_Score(testData1$Likelihood.to.recommend, result5)
# F1 score - 94.68%

varImp(logitBoost1)

svmRadial1 <- train(factor(Likelihood.to.recommend) ~., data = trainData1, method = "svmRadial", trControl=fitControl, preProcess = c("center", "scale"), tuneLength = 10)
svmRadial1
# train accuracy - 89.92%

plot(svmRadial1)

result6 = predict(svmRadial1, newdata = testData1)
sum(result6 == testData1$Likelihood.to.recommend)/length(testData1$Likelihood.to.recommend)
# test accuracy - 89.44%

F1_Score(testData1$Likelihood.to.recommend, result6)
# F1 score - 94.4%

varImp(svmRadial1)

femalePredict = predict(svmRadial1, testData1[testData1$Gender_Male == 0,])
sum(femalePredict == "Detractor")/nrow(testData1[testData1$Gender_Male == 0,])

varImp(svmRadial1)
sum(personalBlueBinned$Likelihood.to.recommend == "Detractor")/length(personalBlueBinned$Likelihood.to.recommend)
# The high accuracy of the model proves the strong association between the customers and their ratings.
# The female customers with type of travel as Personal, airline status as Blue and with no frequent flyer accounts tend to be detactors.
# Customers with origin longitude between -120 - -95 tend to be detractors.

nrow(df[df$Gender == "Female",])/nrow(df)
nrow(personalBlue[personalBlue$Gender == "Female",])/nrow(personalBlue)

mean(df$Likelihood.to.recommend[df$Gender == "Female"])
mean(personalBlue$Likelihood.to.recommend[personalBlue$Gender == "Female"])

# The airline should focus more on the female customers with type of travel as Personal and airline status
# as Blue as their average ratings are significantly lower than that of all female customers and they have
# a tendency to be detractors as proven by the model. They also contribute more to the overall ratings as their
# ratio is higher in this category than the overall data.

nrow(df[df$Gender == "Female" & df$Total.Freq.Flyer.Accts == 0,])/nrow(df)
nrow(personalBlue[personalBlue$Gender == "Female" & personalBlue$Total.Freq.Flyer.Accts == 0,])/nrow(personalBlue)

mean(df$Likelihood.to.recommend[df$Gender == "Female" & df$Total.Freq.Flyer.Accts == 0])
mean(personalBlue$Likelihood.to.recommend[personalBlue$Gender == "Female" & personalBlue$Total.Freq.Flyer.Accts == 0])

# The above conclusion is also true for female customers with no frequent flyer accounts.
nrow(dfBinnedData[dfBinnedData$Type.of.Travel == "Personal Travel" & df$Airline.Status == "Blue" & df$Gender == "Female",])/nrow(dfBinnedData)
mean(df$Likelihood.to.recommend[df$Type.of.Travel == "Personal Travel"])
mean(df$Likelihood.to.recommend)
mean(df$Likelihood.to.recommend[df$Type.of.Travel == "Personal Travel" & df$Airline.Status == "Blue" & df$Gender == "Female"])
mean(df$Likelihood.to.recommend[df$Gender == "Female"])
aatishsuman/IST687-southeast-airlines-customer-churn documentation built on Dec. 18, 2021, 9:28 p.m.