MLDataR.R
In MLDataR: Collection of Machine Learning Datasets for Supervised Machine Learning

## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.height= 5, 
  fig.width=7
)

## ----setup, include = FALSE, echo=FALSE---------------------------------------
library(MLDataR)
library(dplyr)
library(ConfusionTableR)
library(parsnip)
library(rsample)
library(recipes)
library(ranger)
library(workflows)
library(caret)


## ----install_MLDataR----------------------------------------------------------
#install.packages(MLDataR)
library(MLDataR)


## ----thyroid_data-------------------------------------------------------------

glimpse(MLDataR::thyroid_disease)


## ----data_prep----------------------------------------------------------------
data("thyroid_disease")
td <- thyroid_disease
# Create a factor of the class label to use in ML model
td$ThryroidClass <- as.factor(td$ThryroidClass)
# Check the structure of the data to make sure factor has been created
str(td)

## ----remove_nulls-------------------------------------------------------------
# Remove missing values, or choose more advaced imputation option
td <- td[complete.cases(td),]
#Drop the column for referral source
td <- td %>%
   dplyr::select(-ref_src)


## ----splitting----------------------------------------------------------------
#Divide the data into a training test split
set.seed(123)
split <- rsample::initial_split(td, prop=3/4)
train_data <- rsample::training(split)
test_data <- rsample::testing(split)


## ----create_recipe------------------------------------------------------------
td_recipe <-
   recipe(ThryroidClass ~ ., data=train_data) %>%
   step_normalize(all_predictors()) %>%
   step_zv(all_predictors())

print(td_recipe)

## ----random_forest_model------------------------------------------------------
set.seed(123)
rf_mod <-
  parsnip::rand_forest() %>%
  set_engine("ranger") %>%
  set_mode("classification")



## ----creating_workflow--------------------------------------------------------
td_wf <-
   workflow() %>%
   workflows::add_model(rf_mod) %>%
   workflows::add_recipe(td_recipe)

print(td_wf)
# Fit the workflow to our training data
set.seed(123)
td_rf_fit <-
   td_wf %>%
   fit(data = train_data)
# Extract the fitted data
td_fitted <- td_rf_fit %>%
    extract_fit_parsnip()


## ----make_preds_and_evaluate--------------------------------------------------
# Predict the test set on the training set to see model performance
class_pred <- predict(td_rf_fit, test_data)
td_preds <- test_data %>%
    bind_cols(class_pred)
# Convert both to factors
td_preds$.pred_class <- as.factor(td_preds$.pred_class)
td_preds$ThryroidClass <- as.factor(td_preds$ThryroidClass)

str(td_preds)

# Evaluate the data with ConfusionTableR
cm <- binary_class_cm(td_preds$.pred_class,
                      td_preds$ThryroidClass,
                      positive="sick")




## ----modelling_preds----------------------------------------------------------
#View Confusion matrix
cm$confusion_matrix
#View record level
cm$record_level_cm


## ----diabetes-----------------------------------------------------------------
glimpse(MLDataR::diabetes_data)

## ----load_in_heart------------------------------------------------------------
data(heartdisease)
# Convert diabetes data to factor'
hd <- heartdisease %>%
 mutate(HeartDisease = as.factor(HeartDisease))
is.factor(hd$HeartDisease)

## ----dummy_encode-------------------------------------------------------------
# Get categorical columns
hd_cat <- hd  %>%
  dplyr::select_if(is.character)
# Dummy encode the categorical variables 
 cols <- c("RestingECG", "Angina", "Sex")
# Dummy encode using dummy_encoder in ConfusionTableR package
coded <- ConfusionTableR::dummy_encoder(hd_cat, cols, remove_original = TRUE)
coded <- coded %>%
     select(RestingECG_ST, RestingECG_LVH, Angina=Angina_Y,
     Sex=Sex_F)
# Remove column names we have encoded from original data frame
hd_one <- hd[,!names(hd) %in% cols]
# Bind the numerical data on to the categorical data
hd_final <- bind_cols(coded, hd_one)
# Output the final encoded data frame for the ML task
glimpse(hd_final)

## ----ls_one-------------------------------------------------------------------
library(MLDataR)
library(dplyr)
library(ggplot2)
library(caret)
library(rsample)
library(varhandle)

data("long_stayers")
glimpse(long_stayers)


## ----ls_two-------------------------------------------------------------------
long_stayers <- long_stayers %>% 
  dplyr::mutate(stranded.label=factor(stranded.label)) %>% 
  dplyr::select(everything(), -c(admit_date))

cats <- select_if(long_stayers, is.character)
cat_dummy <- varhandle::to.dummy(cats$frailty_index, "frail_ind") 
#Converts the frailty index column to dummy encoding and sets a column called "frail_ind" prefix
cat_dummy <- cat_dummy %>% 
  as.data.frame() %>% 
  dplyr::select(-frail_ind.No_index_item) #Drop the field of interest
# Drop the frailty index from the stranded data frame and bind on our new encoding categorical variables
long_stayers <- long_stayers %>% 
  dplyr::select(-frailty_index) %>% 
  bind_cols(cat_dummy) %>% na.omit(.)

## ----ls_three-----------------------------------------------------------------
split <- rsample::initial_split(long_stayers, prop = 3/4)
train <- rsample::training(split)
test <- rsample::testing(split)

set.seed(123)
glm_class_mod <- caret::train(factor(stranded.label) ~ ., data = train,
                 method = "glm")
print(glm_class_mod)

## ----ls_four------------------------------------------------------------------
split <- rsample::initial_split(long_stayers, prop = 3/4)
train <- rsample::training(split)
test <- rsample::testing(split)

set.seed(123)
glm_class_mod <- caret::train(factor(stranded.label) ~ ., data = train, 
                 method = "glm")
print(glm_class_mod)

## ----ls_five------------------------------------------------------------------
preds <- predict(glm_class_mod, newdata = test) # Predict class
pred_prob <- predict(glm_class_mod, newdata = test, type="prob") #Predict probs

# Join prediction on to actual test data frame and evaluate in confusion matrix

predicted <- data.frame(preds, pred_prob)
test <- test %>% 
  bind_cols(predicted) %>% 
  dplyr::rename(pred_class=preds)

glimpse(test)

## ----ls_six-------------------------------------------------------------------
library(ConfusionTableR)
cm <- ConfusionTableR::binary_class_cm(test$stranded.label, test$pred_class, positive="Stranded")
cm$record_level_cm

library(OddsPlotty)
plotty <- OddsPlotty::odds_plot(glm_class_mod$finalModel,
                                title = "Odds Plot ",
                                subtitle = "Showing odds of patient stranded",
                                point_col = "#00f2ff",
                                error_bar_colour = "black",
                                point_size = .5,
                                error_bar_width = .8,
                                h_line_color = "red")
print(plotty)

Any scripts or data that you put into this service are public.

MLDataR documentation built on Oct. 3, 2022, 5:08 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

MLDataR
Collection of Machine Learning Datasets for Supervised Machine Learning

inst/doc/MLDataR.R
In MLDataR: Collection of Machine Learning Datasets for Supervised Machine Learning

Try the MLDataR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

MLDataR Collection of Machine Learning Datasets for Supervised Machine Learning

inst/doc/MLDataR.R In MLDataR: Collection of Machine Learning Datasets for Supervised Machine Learning

Try the MLDataR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

MLDataR
Collection of Machine Learning Datasets for Supervised Machine Learning

inst/doc/MLDataR.R
In MLDataR: Collection of Machine Learning Datasets for Supervised Machine Learning