# 1.平衡数据 ------------------------------------------------------------------
#' Balance Data
#'
#' Rebalances a labelled data set with `ubBalance()` from the `unbalanced`
#' package. The class ratio (count of the first label level divided by the
#' count of the second) is printed before and after balancing.
#'
#' @param Raw_data Feature data without the label column.
#' @param Label Label of the data; it is converted with `as.factor()`.
#'   According to the `unbalanced` documentation `ubBalance()` expects a
#'   binary label.
#' @param Type Balancing technique handed to `ubBalance(type = )`: one of
#'   "ubOver", "ubUnder", "ubSMOTE", "ubOSS", "ubCNN", "ubENN", "ubNCL" or
#'   "ubTomek". "ubTomek" only allows numeric features because it computes
#'   nearest neighbours.
#' @return A data frame with the balanced features followed by a `label`
#'   column.
#' @examples
#' \dontrun{
#' x <- iris[, 1:4]
#' y <- as.integer(iris$Species == "setosa")
#' BalanceData(x, y)
#' }
BalanceData <- function(Raw_data, Label, Type = 'ubUnder') {
  # library() stops right here when the package is missing; require() would
  # only return FALSE and let the call below fail with an unrelated message.
  library(unbalanced)
  output <- as.factor(Label)
  counts_before <- table(output)
  print(paste("样本标签的比例为:", counts_before[1] / counts_before[2]))
  data <- ubBalance(X = Raw_data, Y = output, type = Type)
  # Assemble the result explicitly as a data frame so that matrix input also
  # yields properly named columns (cbind() on a matrix would coerce the
  # factor label and names<- would not set column names).
  balancedData <- data.frame(data$X, label = data$Y, check.names = FALSE)
  feature_names <- colnames(Raw_data)
  if (!is.null(feature_names)) {
    names(balancedData) <- c(feature_names, "label")
  }
  counts_after <- table(data$Y)
  print(paste('平衡之后的标签比例为:', counts_after[1] / counts_after[2]))
  balancedData
}
# 2.IV --------------------------------------------------------------------
#' Calculate IV and perform woe transformation
#'
#' Appends `Label` to `Raw_data`, computes the information value (IV) of every
#' variable with `scorecard::iv()` and prints it. Optionally runs WOE binning
#' and transformation, or returns the names of variables above an IV
#' threshold.
#'
#' Note from the original author: on one occasion the run ended with
#' "Error #13: Assertion failure at kmp_runtime.cpp(6480)", after which the
#' woe transformation had to be run again.
#'
#' @param Raw_data Raw data but no label
#' @param Label Label of the data; it becomes the last column of the
#'   combined data set.
#' @param Woe_t The default is FALSE, indicating whether to return the data
#'   after the woe transformation
#' @param Filter An IV threshold. When it is not 0 (and `Woe_t` is not
#'   TRUE), the variables whose IV is greater than this value are kept and
#'   their names returned.
#' @return
#'   * `Woe_t = TRUE`: a list holding the woe-transformed data, the binning
#'     details and the IV table (`Filter` is ignored in this case).
#'   * `Filter != 0`: a one-column table (`variable`) with the names of the
#'     variables whose IV exceeds `Filter`.
#'   * otherwise: the IV table, returned invisibly because it has already
#'     been printed.
IV_WOE <- function(Raw_data,
                   Label,
                   Woe_t = FALSE,
                   Filter = 0) {
  library(scorecard)
  Data <- cbind(Raw_data, Label)
  # The label is whatever name cbind() gave the last column.
  label <- names(Data)[ncol(Data)]
  # Compute IV on the data passed in, not on an object from the caller's
  # workspace, and use the actual label column name.
  IV <- scorecard::iv(Data, y = label)
  print(IV)
  if (isTRUE(as.logical(Woe_t))) {
    bins <- scorecard::woebin(Data, y = label)
    print(bins)
    dt_woe <- scorecard::woebin_ply(Data, bins)
    l <- list(dt_woe, bins, IV)
    names(l) <- c("woe变换后的结果","woe变换的详细数据","总的IV值")
    return(l)
  }
  if (Filter != 0) {
    # Namespace-qualified so that other attached packages exporting
    # filter()/select() cannot mask the dplyr verbs.
    kept <- dplyr::filter(IV, info_value > Filter)
    return(dplyr::select(kept, variable))
  }
  invisible(IV)
}
# 3.划分数据集 -------------------------------------------------------------------
# 生成训练样本,测试样本 -------------------------------------------------------------
# 输入 : 原始数据集 标签 划分的比例
#' Dividing data sets
#'
#' Splits the data into a training and a test part using
#' `caret::createDataPartition()` on the label.
#'
#' @param Raw_data Raw data (features only), one row per sample.
#' @param Label label, one entry per row of `Raw_data`
#' @param rate the rate to split data; passed as `p` to
#'   `createDataPartition()`, i.e. the share of rows that goes into the
#'   training part.
#' @return A list of two data frames, training part first and test part
#'   second (also reachable as `$train` and `$test`). Each holds the feature
#'   columns followed by a `label` column.
SplitSample <- function(Raw_data, Label, rate) {
  library(caret)
  train_rows <- as.vector(
    caret::createDataPartition(y = Label, p = rate, list = FALSE)
  )
  # setdiff() instead of negative indexing: x[-integer(0), ] would select no
  # rows at all when the training index happens to be empty.
  test_rows <- setdiff(seq_len(NROW(Raw_data)), train_rows)

  # Builds one part; drop = FALSE keeps single-column input a table instead
  # of collapsing it to a bare vector.
  make_part <- function(rows) {
    part <- data.frame(
      Raw_data[rows, , drop = FALSE],
      label = Label[rows],
      check.names = FALSE
    )
    feature_names <- colnames(Raw_data)
    if (!is.null(feature_names)) {
      names(part) <- c(feature_names, "label")
    }
    part
  }

  list(train = make_part(train_rows), test = make_part(test_rows))
}
# 4.训练模型 ------------------------------------------------------------------
# caret 他训练模型 --------------------------------------------------------------------
#' Train A Model
#'
#' Fits a caret model on `train` with `label` as the response, predicts class
#' probabilities for `test` and evaluates them with `scorecard::perf_eva()`.
#'
#' @param train train data; must contain a `label` column.
#' @param test test data; must contain a `label` column.
#' @param model which model you choose (the `method` argument of
#'   `caret::train()`).
#' @param search random search: when TRUE the model is trained with
#'   `trainControl(search = 'random')`, otherwise with caret's defaults.
#' @return The value of `perf_eva()` for the test predictions.
Caret_Model <- function(train, test, model, search = F) {
  # Load both packages up front so a missing one is reported before any
  # training time is spent.
  library(caret)
  library(scorecard)
  print('build model---------------------------------')
  # caret::train is spelled out because mlr also exports train(); whichever
  # package was attached last would otherwise win.
  if (isTRUE(as.logical(search))) {
    Model <- caret::train(
      label ~ .,
      data = train,
      method = model,
      trControl = caret::trainControl(search = 'random')
    )
  } else {
    Model <- caret::train(label ~ ., data = train, method = model)
  }
  print('predict---------------------------------')
  dt_pred1 <- predict(Model, newdata = test, type = 'prob')
  # The evaluation reads the probability column named "1", so the label
  # needs a class level called "1".
  if (!"1" %in% names(dt_pred1)) {
    stop("Predicted probabilities have no column named '1'; ",
         "the label must have a class level \"1\".", call. = FALSE)
  }
  # NOTE(review): label-first argument order and `type` follow the perf_eva()
  # signature this code was written against; newer scorecard releases may
  # expect the prediction first - confirm against the installed version.
  scorecard::perf_eva(test$label, dt_pred1$`1`, type = c("ks", "lift", "roc", "pr"))
}
# 5.训练模型mlr ---------------------------------------------------------------
#' Train A Model
#'
#' Trains an mlr classification learner on `Train` (target column `label`),
#' predicts probabilities for `Test` and evaluates them with
#' `scorecard::perf_eva()`. Progress and the training time are printed.
#'
#' @param Train train data; must contain a `label` column.
#' @param Test test data; must contain a `label` column.
#' @param model which model you choose (the learner name passed to
#'   `makeLearner(cl = )`).
#' @return The value of `perf_eva()` for the test predictions.
Mlr_Modle <- function(Train,Test,model){
  # Load both packages up front so a missing one is reported before any
  # training time is spent.
  library(mlr)
  library(scorecard)
  tasktrain <- mlr::makeClassifTask(data = Train, target = "label")
  tasktest <- mlr::makeClassifTask(data = Test, target = "label")
  lnr <- mlr::makeLearner(cl = model, predict.type = 'prob')
  a <- Sys.time()
  print(paste("正在构建模型","构建的模型是:",model))
  mdl <- mlr::train(lnr, tasktrain)
  # Force seconds: a plain time difference silently switches to minutes or
  # hours for long runs while the message always says seconds.
  elapsed_secs <- as.numeric(difftime(Sys.time(), a, units = "secs"))
  print(paste("模型构建所花时间为:", elapsed_secs, "秒"))
  print("正在评估模型··································")
  prd <- predict(mdl, tasktest)
  # Use the Test argument for the true labels (not an object named `test`
  # from the calling environment).
  # NOTE(review): column 4 of prd$data is presumably the probability of the
  # second class level - confirm against the prediction layout.
  scorecard::perf_eva(Test$label, prd$data[, 4], type = c("ks", "lift", "roc", "pr"))
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.