Nothing
## ---- include = FALSE---------------------------------------------------------
# Global knitr chunk options: collapse source and output into one block,
# prefix output lines with "#>", and set the figure size to 7 wide by 5 high.
knitr::opts_chunk$set(
  fig.width = 7,
  fig.height = 5,
  comment = "#>",
  collapse = TRUE
)
## ----setup, include = FALSE, echo=FALSE---------------------------------------
library(MLDataR)
library(dplyr)
library(ConfusionTableR)
library(parsnip)
library(rsample)
library(recipes)
library(ranger)
library(workflows)
library(caret)
## ----install_MLDataR----------------------------------------------------------
# install.packages("MLDataR")
library(MLDataR)
## ----thyroid_data-------------------------------------------------------------
# Preview the thyroid disease data set from MLDataR.
dplyr::glimpse(MLDataR::thyroid_disease)
## ----data_prep----------------------------------------------------------------
data("thyroid_disease")
# Work on a copy whose class label is a factor, ready for use as the outcome
# of the ML model.
td <- dplyr::mutate(thyroid_disease, ThryroidClass = as.factor(ThryroidClass))
# Inspect the structure to confirm that the factor has been created.
str(td)
## ----remove_nulls-------------------------------------------------------------
# Keep only the rows without missing values (a more advanced imputation option
# could be used instead). This happens while ref_src is still present, so
# missing values in that column count as well.
td <- td[complete.cases(td), ]
# Drop the referral source column.
td <- dplyr::select(td, -ref_src)
## ----splitting----------------------------------------------------------------
# Seeded split of the cleaned data: three quarters for training, the rest for
# testing.
set.seed(123)
split <- td %>%
  rsample::initial_split(prop = 3 / 4)
train_data <- split %>%
  rsample::training()
test_data <- split %>%
  rsample::testing()
## ----create_recipe------------------------------------------------------------
# Preprocessing recipe: ThryroidClass is the outcome and every other column of
# the training data is a predictor.
td_recipe <-
  recipe(ThryroidClass ~ ., data = train_data) %>%
  # Remove zero-variance predictors BEFORE normalising: a constant column has a
  # standard deviation of zero and therefore cannot be scaled.
  step_zv(all_predictors()) %>%
  # Centre and scale the predictors that remain.
  step_normalize(all_predictors())
print(td_recipe)
## ----random_forest_model------------------------------------------------------
set.seed(123)
# Model specification: a random forest classifier using the ranger engine.
rf_mod <- set_mode(
  set_engine(parsnip::rand_forest(), "ranger"),
  "classification"
)
## ----creating_workflow--------------------------------------------------------
# Bundle the model specification and the preprocessing recipe in one workflow.
td_wf <- workflows::add_recipe(
  workflows::add_model(workflow(), rf_mod),
  td_recipe
)
print(td_wf)
# Train the workflow on the training partition.
set.seed(123)
td_rf_fit <- fit(td_wf, data = train_data)
# Pull the fitted parsnip model out of the trained workflow.
td_fitted <- extract_fit_parsnip(td_rf_fit)
## ----make_preds_and_evaluate--------------------------------------------------
# Use the fitted workflow to predict the class of every test set row.
class_pred <- predict(td_rf_fit, test_data)
# Attach the predictions to the test rows and make sure that both the
# predicted and the observed class are stored as factors.
td_preds <- bind_cols(test_data, class_pred) %>%
  dplyr::mutate(
    .pred_class = as.factor(.pred_class),
    ThryroidClass = as.factor(ThryroidClass)
  )
str(td_preds)
# Evaluate with ConfusionTableR: predicted classes first, observed classes
# second, with "sick" as the positive class.
cm <- binary_class_cm(
  td_preds$.pred_class,
  td_preds$ThryroidClass,
  positive = "sick"
)
## ----modelling_preds----------------------------------------------------------
# View the confusion matrix.
cm$confusion_matrix
# View the record-level output.
cm$record_level_cm
## ----diabetes-----------------------------------------------------------------
# Preview the diabetes data set from MLDataR.
dplyr::glimpse(MLDataR::diabetes_data)
## ----load_in_heart------------------------------------------------------------
data("heartdisease")
# Copy the heart disease data and store the HeartDisease outcome as a factor.
hd <- heartdisease
hd$HeartDisease <- as.factor(hd$HeartDisease)
# Confirm the conversion.
is.factor(hd$HeartDisease)
## ----dummy_encode-------------------------------------------------------------
# Character (categorical) columns of the heart disease data.
hd_cat <- dplyr::select_if(hd, is.character)
# Names of the categorical variables to dummy encode.
cols <- c("RestingECG", "Angina", "Sex")
# Encode them with dummy_encoder() from ConfusionTableR, discarding the source
# columns, then keep four of the indicator columns; the Angina_Y and Sex_F
# indicators take over the original variable names.
coded <- hd_cat %>%
  ConfusionTableR::dummy_encoder(cols, remove_original = TRUE) %>%
  select(
    RestingECG_ST,
    RestingECG_LVH,
    Angina = Angina_Y,
    Sex = Sex_F
  )
# Every column of the original data frame except the ones just encoded.
hd_one <- hd[, !(names(hd) %in% cols)]
# Place the encoded columns in front of the remaining columns.
hd_final <- bind_cols(coded, hd_one)
# Show the final encoded data frame for the ML task.
glimpse(hd_final)
## ----ls_one-------------------------------------------------------------------
library(MLDataR)
library(dplyr)
library(ggplot2)
library(caret)
library(rsample)
library(varhandle)
data("long_stayers")
glimpse(long_stayers)
## ----ls_two-------------------------------------------------------------------
# Store the outcome label as a factor and remove the admission date column.
long_stayers <- long_stayers %>%
  dplyr::mutate(stranded.label = factor(stranded.label)) %>%
  dplyr::select(-admit_date)
# Dummy encode the frailty index; the new columns carry a "frail_ind" prefix.
cats <- dplyr::select_if(long_stayers, is.character)
cat_dummy <- varhandle::to.dummy(cats$frailty_index, "frail_ind") %>%
  as.data.frame() %>%
  dplyr::select(-frail_ind.No_index_item) # drop the No_index_item indicator
# Replace the original frailty index column with its dummy columns, then
# discard every row that contains a missing value.
long_stayers <- long_stayers %>%
  dplyr::select(-frailty_index) %>%
  bind_cols(cat_dummy) %>%
  na.omit()
## ----ls_three-----------------------------------------------------------------
# Seed the random number generator BEFORE splitting: initial_split() assigns
# rows at random, so seeding only afterwards leaves the train/test partition
# (and every result derived from it) different on each run.
set.seed(123)
split <- rsample::initial_split(long_stayers, prop = 3 / 4)
train <- rsample::training(split)
test <- rsample::testing(split)
# Re-seed directly before fitting so that the resampling done by
# caret::train() starts from a known state as well.
set.seed(123)
# Logistic regression (caret method "glm") of the stranded label on all other
# columns of the training data.
glm_class_mod <- caret::train(
  factor(stranded.label) ~ .,
  data = train,
  method = "glm"
)
print(glm_class_mod)
## ----ls_four------------------------------------------------------------------
# Seed the random number generator BEFORE splitting: initial_split() assigns
# rows at random, so seeding only afterwards leaves the train/test partition
# (and every result derived from it) different on each run.
set.seed(123)
split <- rsample::initial_split(long_stayers, prop = 3 / 4)
train <- rsample::training(split)
test <- rsample::testing(split)
# Re-seed directly before fitting so that the resampling done by
# caret::train() starts from a known state as well.
set.seed(123)
# Logistic regression (caret method "glm") of the stranded label on all other
# columns of the training data.
glm_class_mod <- caret::train(
  factor(stranded.label) ~ .,
  data = train,
  method = "glm"
)
print(glm_class_mod)
## ----ls_five------------------------------------------------------------------
# Class predictions and class probabilities for the test set.
preds <- predict(glm_class_mod, newdata = test)
pred_prob <- predict(glm_class_mod, newdata = test, type = "prob")
# Collect both in one data frame, append it to the test rows and expose the
# predicted class under the name pred_class.
predicted <- data.frame(preds, pred_prob)
test <- dplyr::rename(bind_cols(test, predicted), pred_class = preds)
glimpse(test)
## ----ls_six-------------------------------------------------------------------
library(ConfusionTableR)
# Confusion matrix for the logistic regression, with "Stranded" as the
# positive class. The predicted classes go first and the observed labels
# second, the same order as the thyroid confusion matrix earlier in this
# script; passing them the other way round would exchange sensitivity with
# positive predictive value and specificity with negative predictive value.
cm <- ConfusionTableR::binary_class_cm(
  test$pred_class,
  test$stranded.label,
  positive = "Stranded"
)
cm$record_level_cm
library(OddsPlotty)
# Odds plot built from the final model stored in the caret train object.
plotty <- OddsPlotty::odds_plot(
  glm_class_mod$finalModel,
  title = "Odds Plot ",
  subtitle = "Showing odds of patient stranded",
  point_col = "#00f2ff",
  error_bar_colour = "black",
  point_size = .5,
  error_bar_width = .8,
  h_line_color = "red"
)
print(plotty)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.