rm(list = ls()) # dir <- 'F:/Projects/Rpackage/streamlineR' knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
# If the default mirror is blocked, choose another mirror to install R packages, # chooseCRANmirror() install.packages(c('dplyr', 'car', 'caret', 'e1071', 'knitr', 'reshape2', 'corrplot','rpart', 'scales', 'shiny', 'survival', 'gridExtra', 'devtools', 'pec', 'MASS', 'pROC', 'manipulate'), dependencies = TRUE)
加载依赖包
# Load pacakges sapply(c('dplyr', 'car', 'caret', 'e1071', 'knitr', 'reshape2', 'corrplot','rpart', 'scales', 'survival', 'gridExtra', 'devtools', 'pec', 'MASS', 'pROC', 'manipulate', 'shiny'), require, character.only = TRUE)
# shinyapp can not recognizes the libraries needed if they are load all together # with the sapply function as above. The library must be load one by one! library(dplyr) library(car) library(caret) library(e1071) library(knitr) library(reshape2) library(corrplot) library(rpart) library(scales) library(survival) library(gridExtra) library(devtools) library(pec) library(MASS) library(pROC) library(manipulate) library(shiny) library(streamlineR)
从GitHub上安装streamlineR
# Ff the install_github does not work, you can download the package from github, # and then install the package locally: # https://api.github.com/repos/JianhuaHuang/streamlineR/zipball/master devtools::install_github('JianhuaHuang/streamlineR') library(streamlineR)
dt <- survival::pbc %>% transmute(age = round(age), gender = sex, platelet, stage = as.character(stage), time, status = as.numeric(status %in% c(1, 2))) %>% filter(!is.na(stage)) set.seed(1111) # reproducible dt <- dt[sample(nrow(dt), 10000, replace = T), ] row.names(dt) <- NULL dim(dt) str(dt) head(dt)
把数据分成训练数据跟测试数据
set.seed(1111) ind.train <- createDataPartition(dt$status, p = .7, list = FALSE) dt.train <- dt[ind.train, ] dt.test <- dt[-ind.train, ] row.names(dt.train) <- NULL row.names(dt.test) <- NULL dim(dt.train) dim(dt.test) # preserve the original values dt.train.bk <- dt.train dt.test.bk <- dt.test
bin.knn
bin.knn(status ~ platelet, data = dt.train, n.group = 5, min.bucket = 0.05)
manipulate(bin.knn(status ~ platelet, data = dt.train, n.group, min.bucket), n.group = slider(1, 10, step = 1, initial = 5, label = 'Number of Groups'), min.bucket = slider(0.01, .2, step = 0.01, initial = 0.05, label = 'Minimum Bucket'))
inputPanel( sliderInput("n_group", label = 'Number of Groups', min = 1, max = 9, value = 5, step = 1, ticks = FALSE), sliderInput("min_pop", label = 'Minimum Bucket', min = 0.01, max = .2, value = .05, step = .01, ticks = FALSE) ) renderPlot({ bin.knn(formula = status ~ platelet, data = dt.train.bk, n.group = input$n_group, min.bucket = input$min_pop) })
如果上图不能正确显示,请刷新网页
bin.rpart
rpart
(recursive partitioning)
rpart(formula = status ~ age, data = dt.train, control = rpart.control(minbucket = .01 * nrow(dt.train)))
逻辑回归分箱
lg.bin.age <- bin.rpart(formula = status ~ age, data = dt.train, rcontrol = rpart.control(minbucket = .01 * nrow(dt.train))) str(lg.bin.age) lg.bin.platelet <- bin.rpart(formula = status ~ platelet, data = dt.train, rcontrol = rpart.control(minbucket = .01 * nrow(dt.train)))
生存模型分箱
## complexity parameter (cp) = 0.01 surv.bin.age <- bin.rpart(formula = Surv(time, status) ~ age, data = dt.train, rcontrol = rpart.control(minbucket = .01 * nrow(dt.train)))
自动寻找合适的分箱数
inputPanel( sliderInput("cp", label = 'cp', min = 0.001, max = .03, value = .01, step = .001, ticks = FALSE) ) renderPrint({ rs <- bin.rpart(formula = Surv(time, status) ~ age, data = dt.train.bk, rcontrol = rpart.control(cp = input$cp, minbucket = .01 * nrow(dt.train.bk))) })
surv.bin.age2 <- bin.rpart(formula = Surv(time, status) ~ age, data = dt.train, rcontrol = rpart.control(minbucket = .01 * nrow(dt.train)), n.group = 3:7)
替换数值型变量
# We don't need the time column anmore, delete it in both dt.train and dt.test dt.train <- dplyr::select(dt.train, -time) dt.test <- dplyr::select(dt.test, -time) head(dt.train) dt.train$age <- lg.bin.age$bins dt.train$platelet <- lg.bin.platelet$bins head(dt.train)
level.stat
col.x <- c('age', 'gender', 'platelet', 'stage') stat.train <- level.stat(dt.train, x = col.x, y = 'status') head(stat.train)
ggstat
ggstat(data = stat.train, var = "Variable.IV", x = "Group", y = "Rate.1", y.label = "Perc.1", y.label.col = "red", y.title = NULL, bar.col = "cornflowerblue", width = "Rate.group", width.label = "Perc.group", width.label.col = "black", ncol = NULL, theme = 'classic', background = "white")
统一条形宽度
ggstat(stat.train, width = .2)
用WOE作图
ggstat(stat.train, y = 'WOE', y.label = 'WOE.round', width = .2, width.label = NULL, ncol = 4)
replace.woe
replace.woe(data = dt.train, stat = stat.train, replace = FALSE) %>% head dt.train <- replace.woe(data = dt.train, stat = stat.train, replace = TRUE) head(dt.train)
ggcorr
cor.mat <- cor(dt.train[, col.x]) corrplot(cor.mat) ggcorr(cor.mat) # Random example set.seed(1111) data.random <- matrix(runif(100), 10) colnames(data.random) <- paste('Variable', 1:10) cor.random <- cor(data.random) ggcorr(cor.random) # default output ggcorr(cor.random, var.position = 'diagonal', add.legend = FALSE) ggcorr(cor.random, lower = TRUE) ggcorr(cor.random, lower = TRUE, var.position = 'diagonal', high = 'blue', low = 'green')
lg <- glm(status ~ ., dt.train, family=binomial(link='logit')) summary(lg)
分步(stepwise)变量筛选
lg.aic <- stepAIC(lg, k = qchisq(0.05, 1, lower.tail=F)) # p to enter: 0.05 summary(lg.aic) data.frame(vif(lg.aic))
bin.custom & replace.woe
bin.custom
head(dt.test) dt.test$age <- bin.custom(dt.test$age, cut.p = lg.bin.age$cut.points) dt.test$platelet <- bin.custom(dt.test$platelet, cut.p = lg.bin.platelet$cut.points) head(dt.test)
利用WOE代替分类变量: replace.woe
dt.test <- replace.woe(dt.test, stat = stat.train, replace = TRUE) head(dt.test)
perf.auc & perf.decile
perf.auc
perf.auc(model = lg.aic, dt.train, dt.test)
基于十分位的性能检测: perf.decile
pred.test <- predict(lg.aic, newdata = dt.test, type = 'response') perf.decile(actual = dt.test$status, pred = pred.test, add.legend = TRUE)
coef2rate
summary(lg.aic) pred.stat <- coef2rate(data = dt.test, model = lg.aic, stat = stat.train, force.change = TRUE) head(pred.stat) ggstat(pred.stat, y = 'Pred.Rate.1', y.label = 'Pred.Perc.1')
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.