rm(list = ls()) # dir <- 'F:/Projects/Rpackage/streamlineR' knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
# If the default mirror is blocked, choose another mirror to install R packages, # chooseCRANmirror() install.packages(c('dplyr', 'car', 'caret', 'e1071', 'knitr', 'reshape2', 'corrplot','rpart', 'scales', 'shiny', 'survival', 'gridExtra', 'devtools', 'pec', 'MASS', 'pROC', 'manipulate'), dependencies = TRUE)
Load Dependent Packages
# Load pacakges sapply(c('dplyr', 'car', 'caret', 'e1071', 'knitr', 'reshape2', 'corrplot','rpart', 'scales', 'survival', 'gridExtra', 'devtools', 'pec', 'MASS', 'pROC', 'manipulate', 'shiny'), require, character.only = TRUE)
# shinyapp can not recognizes the libraries needed if they are load all together # with the sapply function as above. The library must be load one by one! library(dplyr) library(car) library(caret) library(e1071) library(knitr) library(reshape2) library(corrplot) library(rpart) library(scales) library(survival) library(gridExtra) library(devtools) library(pec) library(MASS) library(pROC) library(manipulate) library(streamlineR)
Install streamlineR
from Github
# Ff the install_github does not work, you can download the package from github, # and then install the package locally: # https://api.github.com/repos/JianhuaHuang/streamlineR/zipball/master devtools::install_github('JianhuaHuang/streamlineR') library(streamlineR)
dt <- survival::pbc %>% transmute(age = round(age), gender = sex, platelet, stage = as.character(stage), time, status = as.numeric(status %in% c(1, 2))) %>% filter(!is.na(stage)) set.seed(1111) # reproducible dt <- dt[sample(nrow(dt), 10000, replace = T), ] row.names(dt) <- NULL dim(dt) str(dt) head(dt)
Split Data into Training and Test Datasets
set.seed(1111) ind.train <- createDataPartition(dt$status, p = .7, list = FALSE) dt.train <- dt[ind.train, ] dt.test <- dt[-ind.train, ] row.names(dt.train) <- NULL row.names(dt.test) <- NULL dim(dt.train) dim(dt.test) # preserve the original values dt.train.bk <- dt.train dt.test.bk <- dt.test
bin.knn
bin.knn(status ~ platelet, data = dt.train, n.group = 5, min.bucket = 0.05)
manipulate(bin.knn(status ~ platelet, data = dt.train, n.group, min.bucket), n.group = slider(1, 10, step = 1, initial = 5, label = 'Number of Groups'), min.bucket = slider(0.01, .2, step = 0.01, initial = 0.05, label = 'Minimum Bucket'))
inputPanel( sliderInput("n_group", label = 'Number of Groups', min = 1, max = 9, value = 5, step = 1, ticks = FALSE), sliderInput("min_pop", label = 'Minimum Bucket', min = 0.01, max = .2, value = .05, step = .01, ticks = FALSE) ) renderPlot({ bin.knn(formula = status ~ platelet, data = dt.train.bk, n.group = input$n_group, min.bucket = input$min_pop) })
Sometimes the web server is not stable. Please reload/refresh the page, if you can not see a figure above
bin.rpart
rpart
rpart(formula = status ~ age, data = dt.train, control = rpart.control(minbucket = .01 * nrow(dt.train)))
Binning for Logistic Model
lg.bin.age <- bin.rpart(formula = status ~ age, data = dt.train, rcontrol = rpart.control(minbucket = .01 * nrow(dt.train))) str(lg.bin.age) lg.bin.platelet <- bin.rpart(formula = status ~ platelet, data = dt.train, rcontrol = rpart.control(minbucket = .01 * nrow(dt.train)))
Binning for Survival Model
## complexity parameter (cp) = 0.01 surv.bin.age <- bin.rpart(formula = Surv(time, status) ~ age, data = dt.train, rcontrol = rpart.control(minbucket = .01 * nrow(dt.train)))
Search for Appropriate Number of Cut Points
inputPanel( sliderInput("cp", label = 'cp', min = 0.001, max = .03, value = .01, step = .001, ticks = FALSE) ) renderPrint({ rs <- bin.rpart(formula = Surv(time, status) ~ age, data = dt.train.bk, rcontrol = rpart.control(cp = input$cp, minbucket = .01 * nrow(dt.train.bk))) })
Sometimes the web server is not stable. Please reload/refresh the page, if you can not see cut points above
surv.bin.age2 <- bin.rpart(formula = Surv(time, status) ~ age, data = dt.train, rcontrol = rpart.control(minbucket = .01 * nrow(dt.train)), n.group = 3:7)
Replace Numerical Varialbes with Bins
# We don't need the time column anmore, delete it in both dt.train and dt.test dt.train <- dplyr::select(dt.train, -time) dt.test <- dplyr::select(dt.test, -time) head(dt.train) dt.train$age <- lg.bin.age$bins dt.train$platelet <- lg.bin.platelet$bins head(dt.train)
level.stat
col.x <- c('age', 'gender', 'platelet', 'stage') stat.train <- level.stat(dt.train, x = col.x, y = 'status') head(stat.train)
ggstat
ggstat(data = stat.train, var = "Variable.IV", x = "Group", y = "Rate.1", y.label = "Perc.1", y.label.col = "red", y.title = NULL, bar.col = "cornflowerblue", width = "Rate.group", width.label = "Perc.group", width.label.col = "black", ncol = NULL, theme = 'classic', background = "white")
Constant Bar Width
ggstat(stat.train, width = .2)
Plot WOE
ggstat(stat.train, y = 'WOE', y.label = 'WOE.round', width = .2, width.label = NULL, ncol = 4)
replace.woe
replace.woe(data = dt.train, stat = stat.train, replace = FALSE) %>% head dt.train <- replace.woe(data = dt.train, stat = stat.train, replace = TRUE) head(dt.train)
ggcorr
cor.mat <- cor(dt.train[, col.x]) corrplot(cor.mat) ggcorr(cor.mat) # Random example set.seed(1111) data.random <- matrix(runif(100), 10) colnames(data.random) <- paste('Variable', 1:10) cor.random <- cor(data.random) ggcorr(cor.random) # default output ggcorr(cor.random, var.position = 'diagonal', add.legend = FALSE) ggcorr(cor.random, lower = TRUE) ggcorr(cor.random, lower = TRUE, var.position = 'diagonal', high = 'blue', low = 'green')
lg <- glm(status ~ ., dt.train, family=binomial(link='logit')) summary(lg)
Stepwise Variable Selection
lg.aic <- stepAIC(lg, k = qchisq(0.05, 1, lower.tail=F)) # p to enter: 0.05 summary(lg.aic) data.frame(vif(lg.aic))
bin.custom & replace.woe
bin.custom
head(dt.test) dt.test$age <- bin.custom(dt.test$age, cut.p = lg.bin.age$cut.points) dt.test$platelet <- bin.custom(dt.test$platelet, cut.p = lg.bin.platelet$cut.points) head(dt.test)
Replace Binned Test Data with WOE: replace.woe
dt.test <- replace.woe(dt.test, stat = stat.train, replace = TRUE) head(dt.test)
perf.auc & perf.decile
perf.auc
perf.auc(model = lg.aic, dt.train, dt.test)
Check Performance Based on Decile Rate: perf.decile
pred.test <- predict(lg.aic, newdata = dt.test, type = 'response') perf.decile(actual = dt.test$status, pred = pred.test, add.legend = TRUE)
coef2rate
summary(lg.aic) pred.stat <- coef2rate(data = dt.test, model = lg.aic, stat = stat.train, force.change = TRUE) head(pred.stat) ggstat(pred.stat, y = 'Pred.Rate.1', y.label = 'Pred.Perc.1')
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.