##******author:zhonghongfa******
##******create:2017-09-20*******
##******update:2018-10-18*******
#' @title Read Dataset Based on Data-dictionary
#'
#' @description
#' \code{fread_basedict} is a wrapper of \code{fread}, which allows you read dataset based on your data-dictionary(see details).
#'
#' @details
#' the format of data-dictionary is required:
#' 1.the first column must be names of variables;
#' 2.the second column must be labels of variables;
#' 3.the third column must be flags which indicate whether it is a variable;
#' 4.the fourth column must be types of variables, and the value in this column should be 'VARCHAR', 'NUMBER' or NULL.
#'
#' @param myfile File name character of data set, must be a csv file.
#' @param mydict File name character of variable dictionary, must be a csv file.
#' @param na A character vector which to be interpreted as NA values, default \code{c("",".","NA","N/A","NULL")}.
#' @param excludevar A vector consisting of the variable names want to be excluded in 'mydict' file, default \code{NULL}.
#'
#' @return A dataframe
#' @importFrom data.table fread
#' @export
fread_basedict <- function(myfile,mydict,na=c("",".","NA","N/A","NULL"),excludevar=NULL) {
# library(data.table)
vardict <- fread(mydict,colClasses=list(character=c(1:2,4),numeric=c(3)),data.table=FALSE,na.strings=na)
if(!is.null(excludevar)) {
vardict <- vardict[!vardict[,1] %in% excludevar,]
}
ifVar <- which(vardict[,3]==0)
ifVarName <- vardict[,1][ifVar]
vardict <- vardict[vardict[,3]==1,] #IF_VAR==1
charVar <- which(toupper(vardict[,4])=="VARCHAR")
numerVar <- which(toupper(vardict[,4])=="NUMBER")
charVarName <- vardict[,1][charVar]
numerVarName <- vardict[,1][numerVar]
mydata <- fread(myfile,colClasses=list(character=charVarName,numeric=numerVarName,NULL=ifVarName),data.table=FALSE,na.strings=na)
return(mydata)
}
#' @title Convert Types of Variables
#'
#' @description
#' \code{convertType} will convert specified variables in dataset to other types.
#'
#' @details
#' when \code{vars=-1}, which variables to be converted depends on \code{toType}:
#' if \code{toType="fac"}, all the character variables will be converted;
#' if \code{toType="cha"}, all the factor variables will be converted;
#' if \code{toType="int"}, all the numeric variables will be converted.
#'
#' @param df A dataframe to be converted.
#' @param vars Vector of column names or numbers to convert, \code{-1} means to convert all matched variables based on \code{toType} automatically, see details.
#' @param toType The type converted to be, values must be one of \code{c("fac","cha","int")}. If \code{toType="int"}, the converted result is intercepting the integer part of specified variable, not rounding.
#'
#' @return A new dataframe with the same order of variables
#' @importFrom dplyr select
#' @export
#'
#' @examples
#' data(CreditData)
#' sample_fac <- convertType(CreditData, toType="fac")
convertType <- function(df,toType,vars=-1) {
# library(dplyr)
ncl <- ncol(df)
nm <- names(df)
if(length(toType) != 1) stop("the length of parameter 'toType' must be equal to 1")
if(!toType %in% c("fac","cha","int")) stop("parameter 'toType' must be one of c('fac','cha','int')")
if(sum(is.na(vars)) > 0) stop("parameter 'vars' contains NA")
if(is.character(vars)) {
if(sum(!vars %in% nm) > 0) stop(paste(paste(vars[which(!vars %in% nm)],collapse=","),"is not in the variable names of 'df'"))
df.spec <- df[vars]
df.unspec <- df[!nm %in% vars]
if(toType == "fac") {
df.factor <- as.data.frame(sapply(df.spec,as.factor))
res <- cbind(df.unspec,df.factor)
} else if(toType == "cha") {
df.cha <- as.data.frame(sapply(df.spec, as.character), stringsAsFactors = FALSE)
res <- cbind(df.unspec,df.cha)
} else {
df.int <- as.data.frame(sapply(df.spec, as.integer), stringsAsFactors = FALSE)
res <- cbind(df.unspec,df.int)
}
} else if(is.numeric(vars)) {
if(!all(as.integer(vars) == vars)) stop("parameter 'vars' is float type, not allowed")
if(length(vars) == 1) {
if(vars < 1 & vars != -1) stop("parameter 'vars' is less than 1")
if(vars > ncl) stop("parameter 'vars' is over the number of dataframe's columns")
if(vars == -1) {
if(toType == "fac") {
if(sum(!sapply(df,is.numeric)) == 0) stop("no character variables can be converted to factors")
df.num <- df[sapply(df,is.numeric)]
df.str <- df[!sapply(df,is.numeric)]
df.factor <- as.data.frame(sapply(df.str,as.factor))
res <- cbind(df.num,df.factor)
} else if(toType == "cha") {
if(sum(!sapply(df,is.factor)) == 0) stop("no factors can be converted to character variables")
df.factor <- df[sapply(df,is.factor)]
df.unfac <- df[!sapply(df,is.factor)]
df.cha <- as.data.frame(sapply(df.factor, as.character), stringsAsFactors = FALSE)
res <- cbind(df.unfac,df.cha)
} else {
if(sum(sapply(df,is.numeric)) == 0) stop("no numeric variables can be converted to integer variables")
df.num <- df[sapply(df,is.numeric)]
df.str <- df[!sapply(df,is.numeric)]
df.int <- as.data.frame(sapply(df.num, as.integer), stringsAsFactors = FALSE)
res <- cbind(df.str,df.int)
}
} else {
df.spec <- df[vars]
df.unspec <- df[-vars]
if(toType == "fac") {
df.factor <- as.data.frame(sapply(df.spec,as.factor))
res <- cbind(df.unspec,df.factor)
} else if(toType == "cha") {
df.cha <- as.data.frame(sapply(df.spec, as.character), stringsAsFactors = FALSE)
res <- cbind(df.unspec,df.cha)
} else {
df.int <- as.data.frame(sapply(df.spec, as.integer), stringsAsFactors = FALSE)
res <- cbind(df.unspec,df.int)
}
}
} else if(length(vars) > 1) {
if(min(vars,na.rm=TRUE) < 1) stop("the min element in 'vars' is less than 1")
if(max(vars,na.rm=TRUE) > ncl) stop("the max element in 'vars' is over the number of dataframe's columns")
df.spec <- df[vars]
df.unspec <- df[-vars]
if(toType == "fac") {
df.factor <- as.data.frame(sapply(df.spec,as.factor))
res <- cbind(df.unspec,df.factor)
} else if(toType == "cha") {
df.cha <- as.data.frame(sapply(df.spec, as.character), stringsAsFactors = FALSE)
res <- cbind(df.unspec,df.cha)
} else {
df.int <- as.data.frame(sapply(df.spec, as.integer), stringsAsFactors = FALSE)
res <- cbind(df.unspec,df.int)
}
} else {
stop("the length of vector 'vars' must be more than or equal to 1")
}
} else {
stop("parameter 'vars' must be an integer or character vector")
}
res <- select(res,nm) #sort the variables
return(res)
}
#' @title Splitting Dataset into Train and Test Set
#'
#' @description
#' \code{splitData} will split the dataset into train and test set by given sampling fraction or quantity, then return a list.
#'
#' @param df A dataframe to be splitted.
#' @param size A numeric. Specifies the sampling fraction or quantity of train set.
#' @param ifpercent Logical, indicating whether the sampling fraction or quantity to be specified.
#'
#' @return A list contains train and test set, and train set is the fisrt.
#' @export
#'
#' @examples
#' data(CreditData)
#' splitresult <- splitData(CreditData, size = 0.7, ifpercent = TRUE)
#' train <- splitresult[[1]]
#' test <- splitresult[[2]]
splitData <- function(df,size,ifpercent=TRUE) {
nrw <- nrow(df)
if(!is.numeric(size)) stop("the parameter 'size' must be a numeric")
if(length(size) != 1) stop("the length of parameter 'size' must be 1")
if(ifpercent) {
if(size < 0 | size > 1) stop("the sampling fraction must between 0 and 1")
size <- round(nrw * size)
} else {
if(size < 1 | size > nrw) stop(paste("the sampling quantity must between 1 and ",nrw,sep=""))
size <- round(size)
}
id <- 1:nrw
id.train <- sample(id, size, replace = FALSE)
id.test <- id[!id %in% id.train]
train <- df[id.train,]
test <- df[id.test,]
return(list(train,test))
}
#' @title Logistic Regression Model Training by Stepwise
#'
#' @description
#' \code{LRfit} is a simple wrapper of \code{\link{glm}} for logistic regression, you only need to specify the data and significance-level for training.
#'
#' @param df A dataframe only with Xs and Y variable, all the category X variables must have been encoded(such as WOE, One-Hot and Dumb-Variable encoding).
#' @param sig A numeric of significance-level.
#'
#' @return Same as \code{\link{glm}}
#' @importFrom stats as.formula
#' @importFrom stats glm
#' @importFrom stats binomial
#' @importFrom stats step
#' @importFrom broom tidy
#' @importFrom dplyr filter
#' @importFrom dplyr select
#' @export
LRfit <- function(df,sig=0.05) {
nm <- names(df)
ncl <- ncol(df)
if(ncl<2) stop("the ncol of 'df' must be least 2")
fmla <- as.formula(paste(nm[ncl]," ~ .",sep=""))
fit <- glm(fmla,data=df,family=binomial())
step_fit <- step(fit,direction="both")
step_df <- tidy(step_fit)
step_df <- step_df[-1,]
if(max(step_df$p.value) >= sig | min(step_df$estimate) <=0) {
step_df <- filter(step_df, p.value < sig & estimate > 0)
filvars <- step_df[,1]
df <- select(df,c(filvars,nm[ncl]))
return(LRfit(df, sig))
} else {
return(step_fit)
}
}
#' @title Auxiliary Function: Extract Elements From Fit Object
#'
#' @description
#' Auxiliary function: \code{extfromFit} will extract elements from a \code{fit} object, then return a vector.
#'
#' @details
#' parameter \code{fit} can be generated by function \code{\link{LRfit}} or \code{\link{glm}}.
#' if element 'term' is extracted, the extracting result already contains '(Intercept)' at the first one, please remember to remove it if you just need variable names.
#'
#' @param fit A fitted object of class inheriting from \code{\link{LRfit}} or \code{\link{glm}}.
#' @param element Name character of element to be extracted, can be one of c("term","estimate","std.error","statistic","p.value").
#'
#' @return A vector
#' @importFrom broom tidy
#' @export
extfromFit <- function(fit,element) {
# library(broom)
ele_vec <- c("term","estimate","std.error","statistic","p.value")
if(!element %in% ele_vec) stop("parameter 'element' must be one of c('term','estimate','std.error','statistic','p.value')")
fit_df <- tidy(fit)
result <- fit_df[,element]
return(result)
}
#' @title Logistic Regression Model Predicting
#'
#' @description
#' \code{LRpredict} is a wrapper of \code{\link{predict}}, which can predict probabilities on train or test set. Different from \code{\link{rawPredictFun_df}}, it does not need to give model coefficient and woe configuration list, but the given \code{newdata} must be woe-encoded in advance when predicts on test.
#'
#' @details
#' parameter \code{fit} can be generated by function \code{\link{LRfit}} or \code{\link{glm}}.
#'
#' @param fit A fitted object of class inheriting from \code{\link{LRfit}} or \code{\link{glm}}.
#' @param newdata A new dataset to be predicted, which only contain x variables, and must have been woe-encoded. Default \code{NULL}, means predicting on train.
#'
#' @return A numeric vector
#' @family model prediction functions
#' @importFrom stats predict
#' @importFrom broom tidy
#' @export
LRpredict <- function(fit,newdata=NULL) {
if(is.null(newdata)) {
p <- predict(fit,type="response")
} else {
fit_df <- as.data.frame(tidy(fit), stringsAsFactors = FALSE)
filvars <- fit_df[-1,1]
newdata <- newdata[filvars]
p <- predict(fit,newdata=newdata,type="response")
}
return(p)
}
# @title Auxiliary Function: Predict p Value of A Raw Observation
#
# @description
# Auxiliary function: \code{rawPredictFun} will predict the p value of an observation, based on woe configuration list(see details) and model coefficients. Different from \code{\link{LRpredict}}, this function can predict on one raw observation directly which have not been woe-encoded or binned.
#
# @details
# parameter \code{config_list} can be generated by function \code{\link{dfBinningFun}}.
#
# @param x A data frame with only one row, which is the observation to be predicted, and it only contain X variables.
# @param config_list A list of woe configuration, whose every component is a dataframe corresponding to a X variable.
# @param coef_vec A vector of model coefficients.
#
# @return A numeric
# @family model prediction functions
# @export
rawPredictFun <- function(x,config_list,coef_vec) {
nclx <- ncol(x)
len_conf <- length(config_list)
if(nclx != len_conf) stop("the length of 'config_list' is not equal to the ncol of 'x'")
woe_vec <- numeric(nclx + 1)
woe_vec[1] <- 1
for(i in 1:nclx) {
woedf <- config_list[[i]]
if(is.na(x[1,i])) {
na_loc <- which(is.na(woedf[,1]))
if(length(na_loc) == 1) {
woe_vec[i+1] <- woedf[na_loc,2]
} else if(length(na_loc) == 0) {
stop(paste("there is a value '",x[1,i],"' of the ",i,"th variable not in bins of 'config_list'",sep=""))
} else stop(paste("the ",i,"th element of 'config_list' is illegal",sep=""))
} else if(is.numeric(x[,i])) {
bin_loc <- which(woedf[,1] >= x[1,i])
if(length(bin_loc) >= 1) {
woe_vec[i+1] <- woedf[bin_loc[1],2]
} else stop(paste("there is a value '",x[1,i],"' of the ",i,"th variable not in bins of 'config_list'",sep=""))
} else {
bin_loc <- which(woedf[,1] == x[1,i])
if(length(bin_loc) == 1) {
woe_vec[i+1] <- woedf[bin_loc,2]
} else if(length(bin_loc) == 0) {
stop(paste("there is a value '",x[1,i],"' of the ",i,"th variable not in bins of 'config_list'",sep=""))
} else stop(paste("the ",i,"th element of 'config_list' is illegal",sep=""))
}
}
p <- 1/(1 + exp(-sum(coef_vec*woe_vec)))
return(p)
}
#' @title Predict p Value of Entire Raw Dataset
#'
#' @description
#' \code{rawPredictFun_df} will predict the p value of all observations, based on woe configuration list(see details) and model coefficients. Different from \code{\link{LRpredict}}, this function can predict on raw observations directly which have not been woe-encoded or binned.
#'
#' @details
#' parameter \code{config_list} can be generated by function \code{\link{dfBinningFun}}.
#'
#' @param x_sample A dataframe with only X variables and contains all the observations to be predicted.
#' @param config_list A list of woe configuration, whose every component is a dataframe corresponding to a x variable.
#' @param coef_vec A vector of model coefficients.
#'
#' @return A numeric vector
#' @family model prediction functions
#' @importFrom plyr alply
#' @importFrom parallel detectCores
#' @importFrom parallel makeCluster
#' @importFrom parallel parSapply
#' @importFrom parallel stopCluster
#' @export
rawPredictFun_df <- function(x_sample,config_list,coef_vec) {
# library(plyr)
# library(parallel)
x_sample_list <- alply(x_sample,1,.parallel=TRUE)
clnum <- detectCores()
cl <- makeCluster(getOption("cl.cores",clnum))
p <- parSapply(cl,x_sample_list,rawPredictFun,config_list=config_list,coef_vec=coef_vec)
stopCluster(cl)
return(p)
}
#' @title Transform Prediction to Standard Score
#'
#' @description
#' \code{transformScore} will transform the predicted probability to standard score by giving base score and scale, then return a integer vector.
#'
#' @details
#' parameter \code{theta0} must be the bad:good of train at any time.
#'
#' @param p A numeric vector, the predicted probability.
#' @param theta0 A numeric, the base ratio of bad:good in train.
#' @param P0 An integer, the base score, default \code{600}.
#' @param PDO An integer, the scale, default \code{50}.
#'
#' @return A integer vector
#' @family model prediction functions
#' @export
transformScore <- function(p,theta0,P0=600,PDO=50) {
B <- PDO/log(2)
A <- P0 + B*log(theta0)
score <- A - B*log(p/(1-p))
return(round(score))
}
# @title Auxiliary Function: Encode One Observation of A Variable as WOE
#
# @description
# Auxiliary function: \code{woeEncodeFun_xval} will encode one observation of a variable as woe value based on \code{woedf}, then return a numeric.
#
# @details
# parameter \code{woedf} is a component of woe configuration list which generated by function \code{\link{dfBinningFun}}.
#
# @param xval An observation value of a variable.
# @param woedf A datafrme corresponding to current variable, which is a component of woe configuration list.
#
# @return A numeric
# @family dataset binning and woe-encoding functions
# @export
woeEncodeFun_xval <- function(xval,woedf) {
if(is.na(xval)) {
na_loc <- which(is.na(woedf[,1]))
if(length(na_loc) == 1) {
return(woedf[na_loc,2])
} else if(length(na_loc) == 0) {
stop(paste("the value '",xval,"' is not in bins of 'woedf'",sep=""))
} else stop("the 'woedf' is illegal")
} else if(is.numeric(xval)) {
bin_loc <- which(woedf[,1] >= xval)
if(length(bin_loc) >= 1) {
return(woedf[bin_loc[1],2])
} else stop(paste("the value '",xval,"' is not in bins of 'woedf'",sep=""))
} else {
bin_loc <- which(woedf[,1] == xval)
if(length(bin_loc) == 1) {
return(woedf[bin_loc,2])
} else if(length(bin_loc) == 0) {
stop(paste("the value '",xval,"' is not in bins of 'woedf'",sep=""))
} else stop("the 'woedf' is illegal")
}
}
# @title Auxiliary Function: Encode A Variable as WOE
#
# @description
# Auxiliary function: \code{woeEncodeFun} will encode a variable as woe values, then return a numeric vector.
#
# @details
# parameter \code{config_list} can be generated by function \code{\link{dfBinningFun}}.
#
# @param x_sample A dataframe contains the specified variable named \code{xnm}.
# @param xnm A name character of specified variable.
# @param config_list A list of woe configuration, whose every component is a dataframe corresponding to a x variable.
#
# @return A numeric vector
# @family dataset binning and woe-encoding functions
# @export
woeEncodeFun <- function(x_sample,xnm,config_list) {
x_vec <- x_sample[,xnm]
woedf <- config_list[[xnm]]
woe_vec <- sapply(x_vec,woeEncodeFun_xval,woedf=woedf)
return(woe_vec)
}
#' @title Encode Dataset as WOE
#'
#' @description
#' \code{woeEncodeFun_df} will encode all observations as woe values, then return a new dataframe.
#'
#' @details
#' parameter \code{config_list} can be generated by function \code{\link{dfBinningFun}}.
#'
#' @param x_sample A dataframe with only X variables and contains all the observations to be encoded.
#' @param config_list A list of woe configuration, whose every component is a dataframe corresponding to a x variable.
#'
#' @return A dataframe after woe-encoded
#' @family dataset binning and woe-encoding functions
#' @importFrom parallel detectCores
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel stopCluster
#' @export
woeEncodeFun_df <- function(x_sample,config_list) {
# library(parallel)
xnms <- names(x_sample)
clnum <- detectCores()
cl <- makeCluster(getOption("cl.cores",clnum))
ori_env <- environment(woeEncodeFun_xval)
clusterExport(cl,"woeEncodeFun_xval",envir = ori_env)
x_sample_encoding <- parSapply(cl,xnms,woeEncodeFun,x_sample=x_sample,config_list=config_list,USE.NAMES = TRUE)
stopCluster(cl)
return(as.data.frame(x_sample_encoding))
}
# @title Auxiliary Function: Execute Binning on One Observation of A Variable
#
# @description
# Auxiliary function: similar to \code{\link{woeEncodeFun_xval}}, based on \code{woedf} to execute binning on one observation of a variable, but will not proceed woe-encode.
#
# @details
# parameter \code{woedf} is a component of woe configuration list which generated by function \code{\link{dfBinningFun}}.
#
# @param xval An observation value of a variable.
# @param woedf A datafrme corresponding to current variable, which is a component of woe configuration list.
#
# @return A character
# @family dataset binning and woe-encoding functions
# @export
executeBinFun_xval <- function(xval,woedf) {
if(is.na(xval)) {
na_loc <- which(is.na(woedf[,1]))
if(length(na_loc) == 1) {
return(NA)
} else if(length(na_loc) == 0) {
stop(paste("the value '",xval,"' is not in bins of 'woedf'",sep=""))
} else stop("the 'woedf' is illegal")
} else if(is.numeric(xval)) {
bin_loc <- which(woedf[,1] >= xval)
if(length(bin_loc) >= 1) {
j <- bin_loc[1]
if(j==1) {
bin_val <- paste("(low,",woedf[j,1],"]",sep="")
} else if(woedf[j,1]==Inf) {
bin_val <- paste("(",woedf[j-1,1],",high)",sep="")
} else {
bin_val <- paste("(",woedf[j-1,1],",",woedf[j,1],"]",sep="")
}
return(bin_val)
} else stop(paste("the value '",xval,"' is not in bins of 'woedf'",sep=""))
} else {
bin_loc <- which(woedf[,1] == xval)
if(length(bin_loc) == 1) {
return(woedf[bin_loc,1])
} else if(length(bin_loc) == 0) {
stop(paste("the value '",xval,"' is not in bins of 'woedf'",sep=""))
} else stop("the 'woedf' is illegal")
}
}
# @title Auxiliary Function: Execute Binning on A Variable
#
# @description
# Auxiliary function: similar to \code{\link{woeEncodeFun}}, based on \code{config_list} to execute binning on a variable, but will not proceed woe-encode.
#
# @details
# parameter \code{config_list} can be generated by function \code{\link{dfBinningFun}}.
#
# @param x_sample A dataframe contains the specified variable named \code{xnm}.
# @param xnm A name character of specified variable.
# @param config_list A list of woe configuration, whose every component is a dataframe corresponding to a x variable.
#
# @return A character vector
# @family dataset binning and woe-encoding functions
# @export
executeBinFun <- function(x_sample,xnm,config_list) {
x_vec <- x_sample[,xnm]
woedf <- config_list[[xnm]]
bin_vec <- sapply(x_vec,executeBinFun_xval,woedf=woedf)
return(bin_vec)
}
#' @title Execute Binning on Dataset
#'
#' @description
#' \code{executeBinFun_df} is similar to \code{\link{woeEncodeFun_df}}, based on \code{config_list} to execute binning on all observations, but will not proceed woe-encode.
#'
#' @details
#' parameter \code{config_list} can be generated by function \code{\link{dfBinningFun}}.
#'
#' @param x_sample A dataframe with only X variables and contains all the observations to be encoded.
#' @param config_list A list of woe configuration, whose every component is a dataframe corresponding to a x variable.
#'
#' @return A dataframe after binned
#' @family dataset binning and woe-encoding functions
#' @importFrom parallel detectCores
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel stopCluster
#' @export
executeBinFun_df <- function(x_sample,config_list) {
# library(parallel)
xnms <- names(x_sample)
clnum <- detectCores()
cl <- makeCluster(getOption("cl.cores",clnum))
ori_env <- environment(executeBinFun_xval)
clusterExport(cl,"executeBinFun_xval",envir = ori_env)
x_sample_binning <- parSapply(cl,xnms,executeBinFun,x_sample=x_sample,config_list=config_list,USE.NAMES = TRUE)
stopCluster(cl)
return(as.data.frame(x_sample_binning))
}
#' @title Auxiliary Function: Improved Equal-Frequency Binning
#'
#' @description
#' Auxiliary function: \code{getEqualFreqCuts} will get the improved Equal-Frequency binning cut points, return a vector.
#'
#' @param x A numeric vector you want to bin.
#' @param n The number of bins for Equal-Frequency binning.
#'
#' @return A vector
#' @importFrom stats quantile
#' @export
#'
#' @examples
#' x <- c(1,2,3,3,3,3,3,4,6)
#' as.vector(quantile(x, probs=seq(0,1,1/3)))
#' getEqualFreqCuts(x,3)
getEqualFreqCuts <- function(x,n) {
cuts <- as.vector(quantile(x, probs=seq(0,1,1/n), na.rm=TRUE))
cuts_uniq <- unique(cuts)
if(length(cuts) > length(cuts_uniq)) {
cuts_rep <- as.numeric(names(which(table(cuts)>1)))
suppressWarnings( #suppress the Warning message
cuts_add <- sapply(cuts_rep,function(elemnt) {return(max(x[which(x<elemnt)],na.rm=TRUE))})
)
cuts <- sort(unique(c(cuts_uniq,cuts_add)))
}
return(cuts)
}
#' @title Auxiliary Function: Optimally Binning of Given Variable
#'
#' @description
#' Auxiliary function: \code{smbinning2} is an enhanced and integrated optimal binning function for score model, which contains 4 different binning methods(see details) for numeric and factor variables.
#'
#' @details
#' the last variable in \code{df} must be binary response variable (0,1). Integer (int) is required. Name of y must not have a dot. Name "default" is not allowed.
#' \code{binMethod=c(1,2,3,4)}, meanings:
#' 1 means optimal binning, and equal-frequency binning is an alternative when optimal binning is not available.
#' 2 means optimal binning, and equal-interval binning is an alternative when optimal binning is not available.
#' 3 means equal-frequency binning.
#' 4 means equal-interval binning.
#' when \code{x} represents a continuous variable: At least 5 different values(excluding NA). Value Inf is not allowed.
#' when \code{x} represents a factor variable: At least 2 different values(excluding NA). Value Inf is not allowed.
#'
#' @param df A dataframe only with Xs and Y variables, and the last variable must be Y, see details.
#' @param x A name character of one x variable(if x variable is a character variable, it must be converted to factor in advance), \code{x} must not have a dot, see details.
#' @param binMethod An integer from 1 to 4, indicates 4 different binning methods(see details).
#' @param p A numeric, means percentage of records per bin, from 0 to 0.5.
#' @param maxcat An integer, specifies the maximum number of categories.
#' @param aliquots An integer, specifies the number of bins for equal-frequency or equal-interval binning method.
#'
#' @return A dataframe with cutpoints and corresponding woes
#' @family dataset binning and woe-encoding functions
#' @importFrom smbinning smbinning
#' @importFrom smbinning smbinning.custom
#' @importFrom smbinning smbinning.factor
#' @export
#'
#' @examples
#' data(CreditData)
#' mysample <- convertType(CreditData, toType="fac")
#' splitresult <- splitData(mysample, size = 0.7, ifpercent = TRUE)
#' train <- splitresult[[1]]
#' test <- splitresult[[2]]
#' smbinning2(train, x = "bscore", binMethod = 1, p = 0.05, maxcat = 10, aliquots = 5)
smbinning2 <- function(df,x,binMethod,p,maxcat,aliquots) {
# library(smbinning)
nm <- names(df)
ncl <- ncol(df)
mydf <- df[c(x,nm[ncl])]
if(is.numeric(mydf[,1])) { #if x is a numeric variable
if(binMethod==1) { #optimal binning, and equal-frequency binning is an alternative when optimal binning is not available
res <- smbinning(mydf,y=nm[ncl],x=x,p=p)
if(is.data.frame(res[[1]])) { #if x can be Optimally Binned
x_woe <- res$ivtable$WoE
len_woe <- length(x_woe)
x_vals <- res$cuts
len_cuts <- length(x_vals)
if(sum(is.na(mydf[,1]))==0) {
x_woe <- x_woe[1:(len_woe-2)]
x_vals <- c(x_vals,Inf)
} else {
x_woe <- x_woe[1:(len_woe-1)]
x_vals <- c(x_vals,Inf,NA)
}
} else {
cuts <- getEqualFreqCuts(mydf[,1],aliquots) #improved equal-frequency binning
cuts <- cuts[2:(length(cuts)-1)]
res <- smbinning.custom(mydf,y=nm[ncl],x=x,cuts=cuts)
x_woe <- res$ivtable$WoE
len_woe <- length(x_woe)
x_vals <- res$cuts
len_cuts <- length(x_vals)
if(sum(is.na(mydf[,1]))==0) {
x_woe <- x_woe[1:(len_woe-2)]
x_vals <- c(x_vals,Inf)
} else {
x_woe <- x_woe[1:(len_woe-1)]
x_vals <- c(x_vals,Inf,NA)
}
}
} else if(binMethod==2) { #optimal binning, and equal-interval binning is an alternative when optimal binning is not available
res <- smbinning(mydf,y=nm[ncl],x=x,p=p)
if(is.data.frame(res[[1]])) { #if x can be Optimally Binned
x_woe <- res$ivtable$WoE
len_woe <- length(x_woe)
x_vals <- res$cuts
len_cuts <- length(x_vals)
if(sum(is.na(mydf[,1]))==0) {
x_woe <- x_woe[1:(len_woe-2)]
x_vals <- c(x_vals,Inf)
} else {
x_woe <- x_woe[1:(len_woe-1)]
x_vals <- c(x_vals,Inf,NA)
}
} else {
xmin <- min(mydf[,1],na.rm=TRUE)
xmax <- max(mydf[,1],na.rm=TRUE)
cuts <- seq(xmin,xmax,length.out=aliquots + 1)
cuts <- cuts[2:(length(cuts)-1)]
res <- smbinning.custom(mydf,y=nm[ncl],x=x,cuts=cuts)
x_woe <- res$ivtable$WoE
len_woe <- length(x_woe)
x_vals <- res$cuts
len_cuts <- length(x_vals)
if(sum(is.na(mydf[,1]))==0) {
x_woe <- x_woe[1:(len_woe-2)]
x_vals <- c(x_vals,Inf)
} else {
x_woe <- x_woe[1:(len_woe-1)]
x_vals <- c(x_vals,Inf,NA)
}
}
} else if(binMethod==3) { #equal-frequency binning
cuts <- getEqualFreqCuts(mydf[,1],aliquots)
cuts <- cuts[2:(length(cuts)-1)]
res <- smbinning.custom(mydf,y=nm[ncl],x=x,cuts=cuts)
x_woe <- res$ivtable$WoE
len_woe <- length(x_woe)
x_vals <- res$cuts
len_cuts <- length(x_vals)
if(sum(is.na(mydf[,1]))==0) {
x_woe <- x_woe[1:(len_woe-2)]
x_vals <- c(x_vals,Inf)
} else {
x_woe <- x_woe[1:(len_woe-1)]
x_vals <- c(x_vals,Inf,NA)
}
} else if(binMethod==4) { #equal-interval binning
xmin <- min(mydf[,1],na.rm=TRUE)
xmax <- max(mydf[,1],na.rm=TRUE)
cuts <- seq(xmin,xmax,length.out=aliquots + 1)
cuts <- cuts[2:(length(cuts)-1)]
res <- smbinning.custom(mydf,y=nm[ncl],x=x,cuts=cuts)
x_woe <- res$ivtable$WoE
len_woe <- length(x_woe)
x_vals <- res$cuts
len_cuts <- length(x_vals)
if(sum(is.na(mydf[,1]))==0) {
x_woe <- x_woe[1:(len_woe-2)]
x_vals <- c(x_vals,Inf)
} else {
x_woe <- x_woe[1:(len_woe-1)]
x_vals <- c(x_vals,Inf,NA)
}
}
} else { #if x is a factor variable
if(length(table(mydf[x])) < 2) stop(paste("error in 'smbinning2' Function: '",x,"' variable with less than 2 different values(excluding NA)",sep=""))
res <- smbinning.factor(mydf,y=nm[ncl],x=x,maxcat=maxcat)
if(is.data.frame(res[[1]])) {
x_woe <- res$ivtable$WoE
len_woe <- length(x_woe)
x_vals <- res$cuts
len_cuts <- length(x_vals)
if(sum(is.na(mydf[,1]))==0) {
x_woe <- x_woe[1:(len_woe-2)]
} else {
x_woe <- x_woe[1:(len_woe-1)]
x_vals <- c(x_vals,NA)
}
} else {
x_woe <- 0
x_vals <- "NULL"
}
}
res <- data.frame(vals=x_vals,woe=x_woe,stringsAsFactors=FALSE)
#I'm not sure if it has any impact when set stringsAsFactors=FALSE above
return(res)
}
#' @title Generate WOE Configuration List Automatically
#'
#' @description
#' \code{dfBinningFun} is a wrapper and parallelization of \code{\link{smbinning2}}, so it can generate the woe configuration list by using optimal binning automatically.
#'
#' @details
#' woe configuration list is a list object lengths as number of x variables in \code{mydf}, whose component is a dataframe consisting of two columns, named as 'vals' and 'woe'.
#'
#' @param mydf A dataframe only with Xs and Y variables, the last column must be Y. All character x variables must be converted to factors in advance.
#' @param binMethod An integer from 1 to 4, indicates 4 different binning methods(see details \code{\link{smbinning2}}), default \code{1}.
#' @param p A numeric, means percentage of records per bin, from 0 to 0.5, default \code{0.05}.
#' @param maxcat An integer, specifies the maximum number of categories, default \code{10}.
#' @param aliquots An integer, specifies the number of bins for equal-frequency or equal-interval binning method, default \code{5}.
#'
#' @return A list corresponding to X variables in \code{mydf}
#' @family dataset binning and woe-encoding functions
#' @importFrom parallel detectCores
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterEvalQ
#' @importFrom parallel clusterExport
#' @importFrom parallel parLapply
#' @importFrom parallel stopCluster
#' @export
dfBinningFun <- function(mydf,binMethod=1,p=0.05,maxcat=10,aliquots=5) {
if(!binMethod %in% c(1,2,3,4)) stop("error in 'dfBinningFun' Function: the parameter of 'binMethod' is not in c(1,2,3,4)")
if(aliquots<2) stop("error in 'dfBinningFun' Function: the parameter of 'aliquots' is less than 2")
aliquots <- floor(aliquots)
# library(parallel)
ncl <- ncol(mydf)
if(ncl<2) stop("the ncol of 'mydf' must be least 2")
nm <- names(mydf)
x_vec <- nm[-length(nm)]
clnum <- detectCores()
cl <- makeCluster(getOption("cl.cores",clnum))
gEFC_env <- environment(getEqualFreqCuts)
clusterEvalQ(cl,library(smbinning))
clusterExport(cl,"getEqualFreqCuts",envir = gEFC_env)
myres <- parLapply(cl,x_vec,smbinning2,df=mydf,binMethod=binMethod,p=p,maxcat=maxcat,aliquots=aliquots)
stopCluster(cl)
names(myres) <- x_vec
return(myres)
}
#' @title Generate WOE Configuration List Manually
#'
#' @description
#' \code{genConfigList} can be used to generate the woe configuration list by batch computing WOEs with given binnings.
#'
#' @details
#' \code{cut_list} must contains max and Inf value in every continuous-representing component. Component such as \code{c(10,20,30,Inf,NA)}.
#'
#' @param df A dataframe only with Xs and Y variables, and the last variable must be Y. Y must be 0 or 1, with 0 represents good and 1 represents bad.
#' @param cut_list A list of cut points vector corresponding X variables, but for factor variable, the element is fixed to constant 'factor'.
#'
#' @return A list corresponding to X variables in \code{df}
#' @family dataset binning and woe-encoding functions
#' @export
#'
#' @examples
#' data(CreditData)
#' mysample <- CreditData[c("gender","gscore","target")]
#' genConfigList(mysample, cut_list = list("factor",c(534,566,577,617,Inf,NA)))
genConfigList <- function(df,cut_list) {
ncl <- ncol(df)
nrw <- nrow(df)
nm <- names(df)
if(ncl < 2) stop("the ncol of 'df' must be least 2")
if(nrw < 2) stop("the nrow of 'df' must be least 2")
totalY <- as.numeric(table(df[ncl]))
if(length(totalY) != 2) stop("the number of Y values must be equal to 2")
totalgood <- totalY[1]
totalbad <- totalY[2]
config_list <- list()
for(i in 1:(ncl-1)) {
elemt <- cut_list[[i]]
if(length(elemt) == 1 & elemt[1] == "factor") {
uniqueX <- unique(df[,i])
woe_vec <- sapply(uniqueX,getWOE,df=df[c(i,ncl)],totalgood=totalgood,totalbad=totalbad)
config_list[[i]] <- data.frame(vals=uniqueX,woe=woe_vec)
} else if(length(elemt) > 1) {
cutbound_list <- convertCutPoints(elemt)
woe_vec <- sapply(cutbound_list,getWOE,df=df[c(i,ncl)],totalgood=totalgood,totalbad=totalbad)
config_list[[i]] <- data.frame(vals=elemt,woe=woe_vec)
} else {
stop(paste("the ",i,"th element of 'cut_list' is not 'factor' string",sep=""))
}
}
names(config_list) <- nm[-ncl]
return(config_list)
}
#' @title Compute KS Value of Score Model
#'
#' @description
#' \code{myks} will compute KS value by giving target and prediction, then return a numeric.
#'
#' @param y A vector of target, only containing 1 and 0.
#' @param predict_y A vector of p prediction.
#'
#' @return A numeric
#' @family model performance functions
#' @importFrom ROCR prediction
#' @importFrom ROCR performance
#' @export
myks <- function(y,predict_y) {
# library(ROCR)
pred <- prediction(predictions=predict_y,labels=y)
perf <- performance(pred,"tpr","fpr")
ks <- max(attr(perf,"y.values")[[1]]-attr(perf,"x.values")[[1]])
return(ks)
}
#' @title Auxiliary Function: Convert Cut Points to Cutbound
#'
#' @description
#' Auxiliary function: convert vector of cut points to a list of cutbound with lbound and ubound.
#'
#' @param cutpoints A vector with serial cut points, such as \code{c(10,20,30,Inf,NA)}, in this example, Inf and NA is necessary.
#'
#' @return A list length as cutpoints vector
#' @family dataset binning and woe-encoding functions
#' @export
#'
#' @examples
#' convertCutPoints(c(10,20,30,Inf,NA))
convertCutPoints <- function(cutpoints) {
len <- length(cutpoints)
flag <- FALSE
if(sum(is.na(cutpoints)) != 0) {
cutpoints <- cutpoints[-len] #remove NA
len <- len - 1
flag <- TRUE
}
cutbound_list <- list()
for(i in 1:len) {
if(i==1) {
cutbound_list[[1]] <- c(-Inf,cutpoints[1])
} else {
cutbound_list[[i]] <- c(cutpoints[i-1],cutpoints[i])
}
}
if(flag==TRUE) cutbound_list[[len+1]] <- NA
return(cutbound_list)
}
#*******************Plot Model Performance Graphs*********************
#' @title Auxiliary Function: Create the Appropriate Data Structure for Model Performance Visualization
#'
#' @description
#' Auxiliary function: create the appropriate data structure for function \code{\link{myCurves}}, return a list.
#'
#' @details
#' the returned list contains two components: a \code{df} 'data.frame(Quant, PropBad, AccumBad, PropGood, AccumGood, BadRate)' and a numeric \code{ks} value.
#'
#' @param y A vector of target, only containing 1 and 0.
#' @param predict_y A vector of p prediction.
#' @param gap A numeric, the gap of two quantile points, default \code{0.05}.
#'
#' @return A list
#' @family model performance functions
#' @importFrom stats quantile
#' @export
Curve_Data <- function(y, predict_y, gap=0.05) {
cnt <- length(y)
cnt_bad <- sum(y) #y==1 means bad
cnt_good <- cnt - cnt_bad
cutPoints <- as.vector(quantile(predict_y, probs=seq(1,0,-gap), na.rm=TRUE))
len_cutPoints <- length(cutPoints)
accumProp_bad <- numeric()
accumProp_good <- numeric()
Prop_bad <- numeric()
Prop_good <- numeric()
accumBad <- numeric()
Bad <- numeric()
for(i in 1:(length(cutPoints)-1)) {
accumProp_bad[i] <- sum(predict_y >= cutPoints[i+1] & y==1)/cnt_bad
accumProp_good[i] <- sum(predict_y >= cutPoints[i+1] & y==0)/cnt_good
accumBad[i] <- sum(predict_y >= cutPoints[i+1] & y==1)
if(i==1) {
Prop_bad[i] <- accumProp_bad[i]
Prop_good[i] <- accumProp_good[i]
Bad[i] <- accumBad[i]
} else {
Prop_bad[i] <- accumProp_bad[i] - accumProp_bad[i-1]
Prop_good[i] <- accumProp_good[i] - accumProp_good[i-1]
Bad[i] <- accumBad[i] - accumBad[i-1]
}
}
badrate <- Bad/(cnt/(len_cutPoints-1))
accumProp_bad <- c(0, accumProp_bad)
accumProp_good <- c(0, accumProp_good)
Prop_bad <- c(0, Prop_bad)
Prop_good <- c(0, Prop_good)
badrate <- c(0,badrate)
ks <- max(abs(accumProp_bad - accumProp_good))
df <- data.frame(Quant=seq(0,1,gap), PropBad=Prop_bad, AccumBad=accumProp_bad,
PropGood=Prop_good, AccumGood=accumProp_good, BadRate=badrate)
return(list(df=df,ks=ks))
}
#' @title Model Performance Visualization
#'
#' @description
#' \code{myCurves} is an integrated, flexible as well as easy to use function of visualization for model performance.
#'
#' @details
#' the result graph has four subgraphs which constitute a matrix, top left is ROC Curve, top right is Lift Figure, lower left is K-S Curve, and lower right is Distribution Figure of standard score.
#'
#' @param ytrain A vector corresponding the response variable encoded with 0 and 1 of train set.
#' @param predict_ytrain A vector of predicted probability values of train set.
#' @param ytest A vector corresponding the response variable encoded with 0 and 1 of test set.
#' @param predict_ytest A vector of predicted probability values of test set.
#' @param ontest Logical, decide plot on train or on test, default \code{TRUE}.
#' @param lift_bins An integer, set the number of binnings, default \code{10}.
#' @param P0 An integer, the base score, default \code{600}.
#' @param PDO An integer, the scale, default \code{50}.
#' @param color_scheme An integer, choice the color scheme 1, 2 or 3, default scheme \code{1}.
#' @param ifsave Logical, whether to save the chart as a file, default \code{TRUE}.
#'
#' @return NULL
#' @family model performance functions
#' @import ggplot2
#' @importFrom pROC roc
#' @importFrom plotROC geom_roc
#' @importFrom plotROC style_roc
#' @importFrom gridExtra arrangeGrob
#' @importFrom gridExtra grid.arrange
#' @importFrom RColorBrewer brewer.pal
#' @importFrom stats dnorm
#' @importFrom stats sd
#' @export
myCurves <- function(ytrain, predict_ytrain, ytest, predict_ytest,
ontest=TRUE, lift_bins=10, P0=600, PDO=50, color_scheme=1, ifsave=TRUE) {
# suppressPackageStartupMessages(library(ggplot2))
# suppressPackageStartupMessages(library(pROC))
# suppressPackageStartupMessages(library(plotROC))
# suppressPackageStartupMessages(library(gridExtra))
# suppressPackageStartupMessages(library(RColorBrewer))
if(color_scheme!=1 & color_scheme!=2 & color_scheme!=3) stop("Parameter 'color_scheme' must be equal 1, 2 or 3")
colsname <- c("Dark2", "Set1", "Set2")
mycolor <- brewer.pal(8, colsname[color_scheme])
if(ontest) {
y <- ytest
predict_y <- predict_ytest
chartnm <- "Test"
} else {
y <- ytrain
predict_y <- predict_ytrain
chartnm <- "Train"
}
mydata <- data.frame(y, predict_y)
#ROC Curve
roc_curve <- roc(y,predict_y)
Specificity <- roc_curve$specificities
Sensitivity <- roc_curve$sensitivities
p_roc <- ggplot(mydata, aes(d = y, m = predict_y)) +
geom_roc(n.cuts = 0, color = mycolor[1]) +
style_roc(minor.breaks = NULL,guide = TRUE, xlab = "1-Specificity",ylab = "Sensitivity", theme = theme()) +
theme(plot.title = element_text(hjust = 0.5)) +
annotate('text', x = 0.5, y = 0.5, label=paste('AUC=', round(roc_curve$auc,3), sep="")) +
labs(title = paste('ROC Curve of',chartnm))
#Lift Curve
LiftData <- Curve_Data(y,predict_y,1/lift_bins)$df #to cut (1/lift_bins) bins
LiftData <- LiftData[-1,c("Quant","BadRate")]
p_lift <- ggplot(aes(x = Quant, y = BadRate), data = LiftData) +
geom_bar(stat = 'identity', fill = mycolor[2]) +
scale_x_continuous(breaks = seq(0.1,1,0.2)) +
theme(plot.title = element_text(hjust = 0.5)) +
labs(x = 'Quantile', y = 'BadRate', title=paste('Lift Figure of',chartnm))
#K-S Curve
KSData <- Curve_Data(y,predict_y)
p_ks <- ggplot() +
geom_line(aes(x = Quant, y = AccumBad), data = KSData$df, color = mycolor[3],size = 1.02) +
geom_line(aes(x = Quant, y = AccumGood), data = KSData$df, color = mycolor[4],size = 1.02) +
theme(plot.title = element_text(hjust = 0.5)) +
annotate(geom = 'text', x = 0.5, y = 1, label = paste('KS=', round(myks(y, predict_y),3), sep="")) +
# annotate(geom = 'text', x = 0.5, y = 1, label = paste('KS=', round(KSData$ks,3), sep="")) +
labs(x = 'Quantile', y = 'Cumulative Proportion', title=paste('K-S Curve of',chartnm))
#Histogram of prediction
bad_cnt <- sum(ytrain) #must be ytrain
good_cnt <- length(ytrain) - bad_cnt
mydata$score <- transformScore(mydata$predict_y,bad_cnt/good_cnt,P0=P0,PDO=PDO)
x_dnorm <- seq(min(mydata$score),max(mydata$score),length.out = 100)
dnormVect <- dnorm(x_dnorm, mean = mean(mydata$score, na.rm = TRUE), sd = sd(mydata$score, na.rm = TRUE))
dnormData <- data.frame(dnormVect)
p_hist <- ggplot() +
geom_histogram(aes(x = score, y = ..density..), mydata, bins = 20, fill = mycolor[5], na.rm = TRUE) +
geom_line(aes(x = x_dnorm, y = dnormVect), dnormData, color = mycolor[6]) +
theme(legend.position = 'none', plot.title = element_text(hjust = 0.5)) +
annotate(geom = 'text', x = max(mydata$score) - 0.8*sd(mydata$score), y = max(dnormVect)/2,
label = paste('mean=', round(mean(mydata$score, na.rm = TRUE),0),'\n',
'sd=', round(sd(mydata$score, na.rm = TRUE),0), sep="")) +
labs(x = 'Score', y = 'Density', title=paste('Distribution Figure of',chartnm))
# #Preparing Gain data
# Pi <- table(y)[2]/sum(table(y))
# Ptp <- Pi*Sensitivity
# Pfp <- (1-Pi)*(1-Specificity)
# Depth <- Ptp + Pfp
# PV_Plus <- Ptp/Depth
# Lift <- PV_Plus/Pi
#
# #Gain Curve
# p <- ggplot(data = NULL, mapping = aes(x= Depth, y = PV_Plus))
# p_gain <- p + geom_line(na.rm = TRUE, size = 1.02, color = mycolor[5]) +
# labs(x = 'Depth',y = 'Precision', title = 'Gain Curve') +
# theme(plot.title = element_text(hjust = 0.5))
dt <- as.character(format(Sys.time(), "%Y%m%d%H%M%S"))
if(ifsave) {
ml <- arrangeGrob(p_roc,p_lift,p_ks,p_hist,ncol=2)
ggsave(paste("PerformanceGraphs", dt, ".pdf", sep = ""), ml, width = 8.27, height = 8.27, units = "in")
ggsave(paste("PerformanceGraphs", dt, ".png", sep = ""), ml, width = 8.27, height = 8.27, units = "in")
}
grid.arrange(p_roc,p_lift,p_ks,p_hist,ncol=2)
}
#********************Model Validation*********************
#' @title Cross Validation for Score Model
#'
#' @description
#' \code{crossValidation} is a easy to use function of t times k folds cross validation for score model. The woe will be recomputed at every time and fold validation.
#'
#' @details
#' this function will cost much time if you set \code{t} and \code{k} too large, \code{t=1} and \code{k=10} is recommended.
#'
#' @param df A dataframe only with Xs and Y variables, and the last variable must be Y, and all the X variables have been binned, but have not been woe-encoded.
#' @param t An integer, means t times, default \code{1}.
#' @param k An integer, means k folds, default \code{10}.
#' @param ifprint Logical, whether to print the intermediate results, default \code{TRUE}.
#'
#' @return A vector contains average of train and test ks value
#' @family model performance functions
#' @importFrom caret createFolds
#' @importFrom dplyr select
#' @importFrom stats binomial
#' @importFrom stats glm
#' @importFrom stats predict
#' @export
crossValidation <- function(df,t=1,k=10,ifprint=TRUE) {
# suppressPackageStartupMessages(library(lattice))
# suppressPackageStartupMessages(library(caret))
# suppressPackageStartupMessages(library(dplyr))
ks_train_cv <- matrix(nrow = t,ncol = k)
ks_test_cv <- matrix(nrow = t,ncol = k)
ncl <- ncol(df)
for(i in 1:t) {
if(ifprint) {
print("---times---")
print(i)
cat("\n")
}
folds <- createFolds(y=df[,ncl], k=k)
for(j in 1:k) {
if(ifprint) {
print("***folds***")
print(j)
}
fold_train <- df[-folds[[j]],]
fold_test <- df[folds[[j]],]
config_list_train <- dfBinningFun(fold_train,binMethod=1,p=0.05,maxcat=10,aliquots=5)
x_fold_train <- select(fold_train,-ncl)
x_fold_train_encoding <- woeEncodeFun_df(x_fold_train,config_list_train)
fold_train_encoding <- cbind(x_fold_train_encoding,fold_train[ncl])
x_fold_test <- select(fold_test,-ncl)
x_fold_test_encoding <- woeEncodeFun_df(x_fold_test,config_list_train)
fold_test_encoding <- cbind(x_fold_test_encoding,fold_test[ncl])
fit <- glm(target ~ ., data=fold_train_encoding, family=binomial())
fold_train_encoding$p <- predict(fit,type="response")
ks_train <- myks(fold_train_encoding$target,fold_train_encoding$p)
fold_test_encoding$p <- predict(fit,newdata=x_fold_test_encoding,type="response")
ks_test <- myks(fold_test_encoding$target,fold_test_encoding$p)
ks_train_cv[i,j] <- ks_train
ks_test_cv[i,j] <- ks_test
if(ifprint) {
print(paste("the KS of train sample:",round(ks_train,4)))
print(paste("the KS of test sample:",round(ks_test,4)))
cat("\n")
}
}
}
ks_train_avg <- mean(ks_train_cv)
ks_test_avg <- mean(ks_test_cv)
print(paste("the mean KS of train sample in CV:",round(ks_train_avg,4)))
print(paste("the mean KS of test sample in CV:",round(ks_test_avg,4)))
return(c(ks_train_avg,ks_test_avg))
}
#' @title Auxiliary Function: Insert Elements into Vector
#'
#' @description
#' Auxiliary function: \code{insertElement} can insert specified elements into the specified location of vector, then return a new vector.
#'
#' @details
#' the elements in \code{indexs} must be sorted by ascending.
#'
#' @param vec A vector to be inserted into.
#' @param indexs A vector consisting of location indexs corresponding to the location of new elements after inserted.
#' @param values A vector to be inserted into parameter \code{vec}.
#'
#' @return A vector
#' @export
#'
#' @examples
#' insertElement(c(6,8,10),c(2,4),c(7,9))
insertElement <- function(vec,indexs,values) {
#limit the mode of 'indexs'
if(!is.numeric(indexs)) {
stop("parameter 'indexs' must be numeric")
} else {
indexs2 <- as.integer(indexs)
if(!all(indexs == indexs2)) stop("parameter 'indexs' must be integer, float is not allowed")
}
#check whether the modes of 'vec' and 'values' are consistent
if(!is.numeric(vec) | !is.numeric(values)) {
if(class(vec) != class(values)) stop("the mode of parameter 'vec' and 'values' must be consistent")
}
len1 <- length(indexs)
len2 <- length(values)
if(len1 != len2) stop("the lengths of parameter 'indexs' and 'values' must be equal")
if(len1 > 0) {
for(i in 1:len1) {
len <- length(vec)
ind <- indexs[i]
val <- values[i]
if(ind == 1) vec <- c(val,vec)
else if(ind > 1 & ind <= len) vec <- c(vec[1:(ind-1)],val,vec[ind:len])
else vec <- c(vec,val)
}
}
return(vec)
}
#' @title Compute the PSI Index of Score Model
#'
#' @description
#' \code{psi} is an efficient and easy to use function computing model's PSI index(see details), return a numeric.
#'
#' @details
#' generally, when PSI < 0.1, model is highly stable; when 0.1-0.25, the stability of model is general; when > 0.25, the stability of model is bad, the model must be rebuilt.
#'
#' @param p1 A vector of prediction in train data.
#' @param p2 A vector of prediction in test data.
#' @param bins An integer, set the number of binnings, default \code{10}.
#' @param binMethod A character string, specify the binning method, must be one of \code{c("EF","EI")}, "EF" means equal-frequency(default), "EI" means equal-interval.
#'
#' @return A numeric
#' @family model stability functions
#' @export
psi <- function(p1, p2, bins=10, binMethod="EF") {
if(!binMethod %in% c("EF","EI")) stop("binMethod must be 'EF' or 'EI'")
virtual <- 0.000001 #use a minima to replace zero avoiding calculation of psi failure
bin_index <- 1:bins
n1 <- length(p1)
n2 <- length(p2)
p1 <- sort(p1)
p2 <- sort(p2)
min_p <- min(p1, p2, na.rm = TRUE)
max_p <- max(p1, p2, na.rm = TRUE)
if(binMethod=="EF") {
quantpoints <- as.vector(quantile(p1, probs=seq(0, 1, length.out = bins + 1), na.rm=TRUE))
cutpoints <- c(min_p, quantpoints[2:(length(quantpoints) - 1)], max_p)
} else {
cutpoints <- seq(min_p, max_p, length.out = bins + 1) #if len(cutpoints)=11, number of bins=10
}
p1_bin <- cut(p1, cutpoints, labels = FALSE, include.lowest = TRUE)
p2_bin <- cut(p2, cutpoints, labels = FALSE, include.lowest = TRUE)
p1_missbin <- bin_index[!bin_index %in% p1_bin] #important step
p2_missbin <- bin_index[!bin_index %in% p2_bin] #important step
p1_group_rate <- as.vector(table(p1_bin))/n1
p2_group_rate <- as.vector(table(p2_bin))/n2
if(length(p1_missbin) > 0) p1_group_rate <- insertElement(p1_group_rate,p1_missbin,rep(virtual,length(p1_missbin)))
if(length(p2_missbin) > 0) p2_group_rate <- insertElement(p2_group_rate,p2_missbin,rep(virtual,length(p2_missbin)))
mypsi <- sum((p2_group_rate - p1_group_rate) * log(p2_group_rate/p1_group_rate))
return(mypsi)
}
#********************Data Preprocessing*********************
#' @title Auxiliary Function: Exclude Columns in Data Frame
#'
#' @description
#' Auxiliary function: \code{excludeCol} will exclude the given columns, then return a new dataframe.
#'
#' @param df A dataframe.
#' @param exclude Vector of column names or numbers to exclude.
#'
#' @return A dataframe
#' @family data preprocessing functions
#' @export
#'
#' @examples
#' data(CreditData)
#' x_sample <- excludeCol(CreditData, exclude = "target")
excludeCol <- function(df,exclude) {
ncl <- ncol(df)
nm <- names(df)
if(sum(is.na(exclude)) > 0) stop("parameter 'exclude' contains NA")
if(is.character(exclude)) {
if(sum(!exclude %in% nm) > 0) stop(paste(paste(exclude[which(!exclude %in% nm)],collapse=","),"is not in the variable names of 'df'"))
df <- df[!nm %in% exclude]
} else if(is.numeric(exclude)) {
if(!all(as.integer(exclude) == exclude)) stop("parameter 'exclude' is float type, not allowed")
if(min(exclude,na.rm=TRUE) < 1) stop("the min element in 'exclude' is less than 1")
if(max(exclude,na.rm=TRUE) > ncl) stop("the max element in 'exclude' is over the number of dataframe's columns")
df <- df[-exclude]
} else {
stop("parameter 'exclude' must be an integer or character vector")
}
return(df)
}
#' @title Compute and Delete NA Rate of Variable
#'
#' @description
#' \code{delNArate} will compute NA rate of every variable in \code{df} firstly, then delete variables whose na rate is greater than or equal to threshold, return a list contains deleted dataframe.
#'
#' @details
#' the returned list contains three components, one is dataframe named 'naratedf', second is a dataframe named 'delVardf', last one is an after-deleted dataframe.
#'
#' @param df A dataframe.
#' @param narate_critical A numeric, Specifies NA rate threshold for selecting and returning variables, default \code{0.9}.
#' @param exclude Vector of column names or numbers to exclude, default \code{NULL}.
#'
#' @return A list
#' @family data preprocessing functions
#' @export
#'
#' @examples
#' data(CreditData)
#' result1 <- delNArate(CreditData, narate_critical = 0.7)
#' result2 <- delNArate(CreditData, narate_critical = 0.7, exclude = "multiloantimes")
#' mysample <- result2[[3]]
delNArate <- function(df,narate_critical=0.9,exclude=NULL) {
df2 <- df
nm2 <- names(df2)
if(!is.null(exclude)) df <- excludeCol(df,exclude)
nrw <- nrow(df)
nm <- names(df)
# cntna <- apply(df,2,countNA)
cntna <- apply(df,2,function(x) sum(is.na(x))) #Computing the NA percentage of variables
narate <- cntna / nrw
#be carefull! character vectors be converted to factors in 'data.frame' function
naratedf <- data.frame(VarName=nm, NARate=narate, row.names=NULL, stringsAsFactors=FALSE)
delVardf <- naratedf[naratedf$NARate >= narate_critical,]
delvars <- delVardf[,1]
delflag <- nm2 %in% delvars
keepdata <- df2[!delflag]
return(list(naratedf,delVardf,keepdata))
}
# @title Auxiliary Function: Get the Max Percent of the Given Variable's Single-Value
#
# @description
# Auxiliary function: return the max percent of Single-Value in the given x variable.
#
# @param df A dataframe.
# @param x_nm Name of x variable.
#
# @return A numeric
# @family data preprocessing functions
# @export
#
# @examples
# data(CreditData)
# maxSinvalPercent_x(CreditData, "gender")
maxSinvalPercent_x <- function(df,x_nm) {
x <- df[,x_nm]
len <- length(x)
if(len==0) stop(paste("error in 'maxSinvalPercent_x' Function: vector '",x_nm,"' contains zero element",sep=""))
cnts <- as.vector(table(x,useNA="ifany"))
percents <- cnts/len
return(max(percents,na.rm=TRUE))
}
#' @title Get the Max Percents of All Variable's Single-Value
#'
#' @description
#' \code{maxSinvalPercent} will return the max percent vector of every variable's Single-Value in \code{df}.
#'
#' @param df A dataframe.
#'
#' @return A vector
#' @family data preprocessing functions
#' @export
maxSinvalPercent <- function(df) {
nm <- names(df)
myres <- sapply(nm,maxSinvalPercent_x,df=df)
return(myres)
}
#' @title Delete Variables Based on Single-Value Percent
#'
#' @description
#' \code{delSinvalPercent} will delete variables whose single-value percent is more than or equal to the given threshold, then return a new dataframe.
#'
#' @details
#' generally, in score model, the threshold of single value percent is often set to 0.9.
#'
#' @param df A dataframe.
#' @param percent The given threshold, default \code{0.9}.
#' @param exclude Vector of column names or numbers to exclude, default \code{NULL}.
#'
#' @return A dataframe after deleted
#' @family data preprocessing functions
#' @export
delSinvalPercent <- function(df,percent=0.9,exclude=NULL) {
df2 <- df
nm2 <- names(df2)
if(!is.null(exclude)) df <- excludeCol(df,exclude)
nm <- names(df)
sinvalper <- maxSinvalPercent(df)
delvars <- nm[which(sinvalper >= percent)]
delflag <- nm2 %in% delvars
return(df2[!delflag])
}
#' @title Delete Variables Based on Number of Different Values
#'
#' @description
#' \code{delFewValues} will delete variables whose number of different values is too few(see details) to use binning function, and return a new dataframe.
#'
#' @details
#' for numeric variable, number of different values must be 5 at least, excluding missing values;
#' for factor variable, must be 2 at least, excluding missing values also.
#'
#' @param df A dataframe.
#' @param minN Min number for numeric variable, default \code{5}.
#' @param minF Min number for factor variable, default \code{2}.
#' @param exclude Vector of column names or numbers to exclude, default \code{NULL}.
#'
#' @return A dataframe after deleted
#' @family data preprocessing functions
#' @export
#'
#' @examples
#' data(CreditData)
#' mysample <- delFewValues(CreditData, minN = 5, minF = 2, exclude = "target")
delFewValues <- function(df,minN=5,minF=2,exclude=NULL) {
df2 <- df
nm2 <- names(df2)
if(!is.null(exclude)) df <- excludeCol(df,exclude)
nm <- names(df)
numVal <- sapply(df,FUN = function(x) {return(length(table(x)))})
varType <- sapply(df,is.numeric)
delvars <- nm[which((varType & numVal < minN) | (!varType & numVal < minF))]
delflag <- nm2 %in% delvars
return(df2[!delflag])
}
#' @title Compute the Given Binning WOE
#'
#' @description
#' \code{getWOE} will compute the given binning WOE of x variable, return a numeric.
#'
#' @param df A dataframe only with two variables, X and Y.
#' @param cutbound A single value for factor variable, or a vector with lbound and ubound for numeric variable.
#' @param totalgood Default \code{NULL}. if not default, will use them directly, otherwise will recalculate based on \code{df}.
#' @param totalbad Default \code{NULL}. if not default, will use them directly, otherwise will recalculate based on \code{df}.
#'
#' @return a numeric
#' @family WOE and IV computing functions
#' @export
#'
#' @examples
#' data(CreditData)
#' getWOE(CreditData[c("gender","target")], cutbound = "M")
#' getWOE(CreditData[c("gscore","target")], cutbound = c(-Inf,534))
#' getWOE(CreditData[c("gscore","target")], cutbound = c(577,617))
#' getWOE(CreditData[c("gscore","target")], cutbound = c(617,Inf))
#' getWOE(CreditData[c("gscore","target")], cutbound = NA)
getWOE <- function(df,cutbound,totalgood=NULL,totalbad=NULL) {
ncl <- ncol(df)
nm <- names(df)
if(ncl != 2) stop("error in 'getWOE' Function: the ncol of 'df' must be equal to 2")
totalY <- as.numeric(table(df[2]))
if(length(totalY) != 2) stop("error in 'getWOE' Function: the number of Y values must be equal to 2")
if(is.null(totalgood)) totalg <- totalY[1] else totalg <- totalgood
if(is.null(totalbad)) totalb <- totalY[2] else totalb <- totalbad
X <- df[,1]
Y <- df[,2]
len_cut <- length(cutbound)
if(len_cut==1) {
if(is.na(cutbound)) {
totalY_sub <- as.numeric(table(Y[is.na(X)]))
} else {
totalY_sub <- as.numeric(table(Y[X==cutbound]))
}
if(length(totalY_sub)==1) stop(paste("purely binning error in 'getWOE' Function. variable:'",nm[1],"', binning:'=",cutbound,"'",sep=""))
Good <- totalY_sub[1]
Bad <- totalY_sub[2]
} else if(len_cut==2) {
lb <- cutbound[1]
ub <- cutbound[2]
totalY_sub <- as.numeric(table(Y[X>lb & X<=ub]))
if(length(totalY_sub)==1) stop(paste("purely binning error in 'getWOE' Function. variable:'",nm[1],"', binning:(",lb,",",ub,"]",sep=""))
Good <- totalY_sub[1]
Bad <- totalY_sub[2]
} else {
stop("error in 'getWOE' Function: the length of parameter 'cutbound' is not 1 or 2")
}
WOE <- log((Bad/Good)/(totalb/totalg))
return(WOE)
}
#' @title Compute the Given Binning IV
#'
#' @description
#' \code{getIV} will compute the given binning IV of x variable, return a numeric.
#'
#' @param df A dataframe only with two variables, X and Y.
#' @param cutbound A single value for factor variable, or a vector with lbound and ubound for numeric variable.
#' @param totalgood Default \code{NULL}. if not default, will use them directly, otherwise will recalculate based on \code{df}.
#' @param totalbad Default \code{NULL}. if not default, will use them directly, otherwise will recalculate based on \code{df}.
#'
#' @return A numeric
#' @family WOE and IV computing functions
#' @export
#'
#' @examples
#' data(CreditData)
#' getIV(CreditData[c("gender","target")], cutbound = "M")
#' getIV(CreditData[c("gscore","target")], cutbound = c(-Inf,534))
#' getIV(CreditData[c("gscore","target")], cutbound = c(577,617))
#' getIV(CreditData[c("gscore","target")], cutbound = c(617,Inf))
#' getIV(CreditData[c("gscore","target")], cutbound = NA)
getIV <- function(df,cutbound,totalgood=NULL,totalbad=NULL) {
ncl <- ncol(df)
nm <- names(df)
if(ncl != 2) stop("error in 'getIV' Function: the ncol of 'df' must be equal to 2")
totalY <- as.numeric(table(df[2]))
if(length(totalY) != 2) stop("error in 'getIV' Function: the number of Y values must be equal to 2")
if(is.null(totalgood)) totalg <- totalY[1] else totalg <- totalgood
if(is.null(totalbad)) totalb <- totalY[2] else totalb <- totalbad
X <- df[,1]
Y <- df[,2]
len_cut <- length(cutbound)
if(len_cut==1) {
if(is.na(cutbound)) {
totalY_sub <- as.numeric(table(Y[is.na(X)]))
} else {
totalY_sub <- as.numeric(table(Y[X==cutbound]))
}
if(length(totalY_sub)==1) stop(paste("purely binning error in 'getIV' Function. variable:'",nm[1],"', binning:'=",cutbound,"'",sep=""))
Good <- totalY_sub[1]
Bad <- totalY_sub[2]
} else if(len_cut==2) {
lb <- cutbound[1]
ub <- cutbound[2]
totalY_sub <- as.numeric(table(Y[X>lb & X<=ub]))
if(length(totalY_sub)==1) stop(paste("purely binning error in 'getIV' Function. variable:'",nm[1],"', binning:(",lb,",",ub,"]",sep=""))
Good <- totalY_sub[1]
Bad <- totalY_sub[2]
} else {
stop("error in 'getIV' Function: the length of parameter 'cutbound' is not 1 or 2")
}
IV <- ((Bad/totalb)-(Good/totalg))*log((Bad/totalb)/(Good/totalg))
return(IV)
}
#' @title Auxiliary Function: Compute Sum IV of Given Variable
#'
#' @description
#' Auxiliary function: \code{sumIV} will compute the sum IV of a binned X variable, return a numeric.
#'
#' @details
#' notice: the given X variable must have be binned in advance.
#'
#' @param df A dataframe only with Xs and Y variable, and the last variable must be Y.
#' @param x A name character of X variable to be computed.
#' @param totalgood Default \code{NULL}. if not default, will use them directly, otherwise will recalculate based on \code{df}.
#' @param totalbad Default \code{NULL}. if not default, will use them directly, otherwise will recalculate based on \code{df}.
#'
#' @return A numeric
#' @family WOE and IV computing functions
#' @export
#'
#' @examples
#' data(CreditData)
#' sumIV(CreditData, x = "gender")
sumIV <- function(df,x,totalgood=NULL,totalbad=NULL) {
ncl <- ncol(df)
nm <- names(df)
if(ncl < 2) stop("error in 'sumIV' Function: the ncol of 'df' Must be greater than or equal to 2")
totalY <- as.numeric(table(df[ncl]))
if(length(totalY) != 2) stop("error in 'sumIV' Function: the number of Y values must be equal to 2")
if(is.null(totalgood)) totalg <- totalY[1] else totalg <- totalgood
if(is.null(totalbad)) totalb <- totalY[2] else totalb <- totalbad
X <- df[,x]
Ynm <- nm[ncl]
mydf <- df[c(x,Ynm)]
Xvals <- unique(X)
IVs <- sapply(Xvals,getIV,df=mydf,totalgood=totalgood,totalbad=totalbad)
return(sum(IVs))
}
#' @title Compute Sum IV of All X Variables
#'
#' @description
#' \code{dfIV} is a wrapper of function \code{\link{sumIV}}, so it can compute the sum IV of every X variable in \code{df}.
#'
#' @details
#' notice: all the X variables must have been binned in advance, such as after woe-encoded.
#'
#' @param df A dataframe only with Xs and Y variable, all the X variables must have been binned in advance, such as after woe-encoded.
#'
#' @return A dataframe contains two columns, one is \code{varname}, another is \code{IV}
#' @family WOE and IV computing functions
#' @export
dfIV <- function(df) {
ncl <- ncol(df)
nm <- names(df)
Xnm <- nm[-ncl]
myIVs <- sapply(Xnm,sumIV,df=df,USE.NAMES = FALSE)
myres <- data.frame(varname=Xnm,IV=myIVs,stringsAsFactors=FALSE)
return(myres)
}
#' @title Collinearity Elimination Automatically
#'
#' @description
#' \code{collElimination} is a recursive function, which will recursivly eliminate the collinearity X variables with smaller IV automatically, then return a new dataframe.
#'
#' @details
#' for collinearity elimination, recursion is necessary to avoid mistakenly-deleted.
#'
#' @param df A dataframe only with Xs and Y variable, all the X variables must have been binned, such as after woe-encoded.
#' @param cor_critical Specifies the correlation coefficient threshold for extracting, default \code{0.8}.
#'
#' @return A dataframe after elimination
#' @family collinearity elimination functions
#' @importFrom dplyr group_by
#' @importFrom dplyr n
#' @importFrom dplyr summarise
#' @importFrom stats cor
#' @export
collElimination <- function(df,cor_critical=0.8) { #Recursive Function
# library(dplyr)
IVs <- dfIV(df) #return a dataframe, computing every X variable's IV
ncl <- ncol(df)
cordf <- as.data.frame(cor(df[-ncl]))
varnm <- names(cordf)
cor_critical_abs <- abs(cor_critical)
indx <- which(cordf >= cor_critical_abs | cordf <= -cor_critical_abs, arr.ind = TRUE)
var_coll <- data.frame(varname_row=varnm[indx[,1]],varname_col=varnm[indx[,2]])
var_summ <- as.data.frame(summarise(group_by(var_coll,varname_row),n=n()-1))
var_summ <- var_summ[var_summ$n>0,]
if(nrow(var_summ)>0) {
var_IV <- merge(var_summ,IVs,by.x="varname_row",by.y="varname")
currentvar <- var_IV[which.max(var_IV[,3]),1]
delvar_vec <- var_coll[var_coll[,1]==currentvar & var_coll[,2]!=currentvar,2]
delindx <- names(df) %in% delvar_vec
df <- df[!delindx]
return(collElimination(df,cor_critical))
} else {
return(df)
}
}
#' @title Collinearity Elimination by Giving Variables' IV
#'
#' @description
#' \code{collElimination2} is a recursive function, which will recursivly eliminate the collinearity X variables with smaller IV. Different from \code{\link{collElimination}}, it must provide parameter \code{IVdf}, so need not recalculate every variable's IV for speed up.
#'
#' @details
#' for collinearity elimination, recursion is necessary to avoid mistakenly-deleted.
#'
#' @param df A dataframe only with Xs and Y variable, all the X variables must have been binned, such as after woe-encoded.
#' @param IVdf A dataframe of every X variable's IV.
#' @param cor_critical Specifies the correlation coefficient threshold for extracting, default \code{0.8}.
#'
#' @return A dataframe after elimination
#' @family collinearity elimination functions
#' @importFrom dplyr group_by
#' @importFrom dplyr n
#' @importFrom dplyr summarise
#' @importFrom stats cor
#' @export
collElimination2 <- function(df,IVdf,cor_critical=0.8) {
# library(dplyr)
ncl <- ncol(df)
cordf <- as.data.frame(cor(df[-ncl]))
varnm <- names(cordf)
cor_critical_abs <- abs(cor_critical)
indx <- which(cordf >= cor_critical_abs | cordf <= -cor_critical_abs, arr.ind = TRUE)
var_coll <- data.frame(varname_row=varnm[indx[,1]],varname_col=varnm[indx[,2]])
var_summ <- as.data.frame(summarise(group_by(var_coll,varname_row),n=n()-1))
var_summ <- var_summ[var_summ$n>0,]
if(nrow(var_summ)>0) {
var_IV <- merge(var_summ,IVdf,by.x="varname_row",by.y="varname")
currentvar <- var_IV[which.max(var_IV[,3]),1]
delvar_vec <- var_coll[var_coll[,1]==currentvar & var_coll[,2]!=currentvar,2]
delindx <- names(df) %in% delvar_vec
df <- df[!delindx]
return(collElimination(df,cor_critical))
} else {
return(df)
}
}
#*************************Data Prebinning**************************
# @title Auxiliary Function: Optimal Binning for Single Continuous Variable
#
# @description
# Auxiliary function: a wrapper and enhancing of function \code{smbinning}.
#
# @param myx A name character of one continuous variable, \code{myx} must not have a dot.
# @param df A data frame of dataset consisting of only Xs and Y variables, the last column must be Y.
# @param binMethod An integer from 1 to 5, indicates 5 different binning methods.
# @param per A numeric, means percentage of records per bin, from 0 to 0.5.
# @param aliquots An integer, specifies the number of bins for equal-frequency or equal-interval binning method.
#
# @return A list
# @family dataset binning and woe-encoding functions
# @importFrom smbinning smbinning
# @importFrom smbinning smbinning.custom
# @export
#
# @examples
#
mysmbinning <- function(myx,df,binMethod,per,aliquots) {
##Auxiliary function
# library(smbinning)
nm <- names(df)
ncl <- ncol(df)
mydf <- df[c(myx,nm[ncl])] #Subset for speeding up and Reducing memory occupation
if(binMethod==1) { #optimal binning, and equal-frequency binning is an alternative when optimal binning is not available
res <- smbinning(mydf,y=nm[ncl],x=myx,p=per)
if(is.data.frame(res[[1]])) { #if x can be Optimally Binned
res[[3]] <- NULL #remove the 'ctree' element
} else {
cuts <- getEqualFreqCuts(mydf[,1],aliquots)
cuts <- cuts[2:(length(cuts)-1)]
res <- smbinning.custom(mydf,y=nm[ncl],x=myx,cuts=cuts)
}
} else if(binMethod==2) { #optimal binning, and equal-interval binning is an alternative when optimal binning is not available
res <- smbinning(mydf,y=nm[ncl],x=myx,p=per)
if(is.data.frame(res[[1]])) {
res[[3]] <- NULL
} else {
xmin <- min(mydf[,1],na.rm=TRUE)
xmax <- max(mydf[,1],na.rm=TRUE)
cuts <- seq(xmin,xmax,length.out=aliquots + 1)
cuts <- cuts[2:(length(cuts)-1)]
res <- smbinning.custom(mydf,y=nm[ncl],x=myx,cuts=cuts)
}
} else if(binMethod==3) { #equal-frequency binning
cuts <- getEqualFreqCuts(mydf[,1],aliquots)
cuts <- cuts[2:(length(cuts)-1)]
res <- smbinning.custom(mydf,y=nm[ncl],x=myx,cuts=cuts)
} else if(binMethod==4) { #equal-interval binning
xmin <- min(mydf[,1],na.rm=TRUE)
xmax <- max(mydf[,1],na.rm=TRUE)
cuts <- seq(xmin,xmax,length.out=aliquots + 1)
cuts <- cuts[2:(length(cuts)-1)]
res <- smbinning.custom(mydf,y=nm[ncl],x=myx,cuts=cuts)
} else if(binMethod==5) { #only optimal binning
res <- smbinning(mydf,y=nm[ncl],x=myx,p=per)
if(is.data.frame(res[[1]])) { #if x can be Optimally Binned
res[[3]] <- NULL #remove the 'ctree' element
}
}
return(res)
}
# @title Auxiliary Function: Optimal Binning for Single Factor Variable
#
# @description
# Auxiliary function: a wrapper and enhancing of function \code{smbinning.factor}.
#
# @param myx A name character of one factor variable(if not a factor, must be converted in advance), \code{myx} must not have a dot.
# @param df A data frame of dataset consisting of only Xs and Y variables, the last column must be Y.
#
# @return A list
# @family dataset binning and woe-encoding functions
# @importFrom smbinning smbinning.factor
# @export
#
# @examples
#
mysmbinning.factor <- function(myx,df) {
##Auxiliary function
# library(smbinning)
nm <- names(df)
ncl <- ncol(df)
mydf <- df[c(myx,nm[ncl])] #Subset for speeding up and Reducing memory occupation
res <- smbinning.factor(mydf,y=nm[ncl],x=myx,maxcat=20)
return(res)
}
#' @title Variable Pre-Binning, then Computing WOE and IV
#'
#' @description
#' \code{preBinningFun} is often used to pre-binning with returning plots and a lots of details for variables filtering, and it has integrated 5 different binning methods(see details). Important notes: the calculated WOE value in this function is the opposite of the actual value.
#'
#' @details
#' \code{binMethod=c(1,2,3,4,5)}, means:
#' 1 means optimal binning, and equal-frequency binning is an alternative when optimal binning is not available.
#' 2 means optimal binning, and equal-interval binning is an alternative when optimal binning is not available.
#' 3 means equal-frequency binning.
#' 4 means equal-interval binning.
#' 5 means optimal binning only.
#'
#' this function will generate four files in current directory, including 'binGraph.pdf', 'varSummary.csv', 'summaryIV.csv' and 'insignificantVars.csv'(if it exists), as well as mass csv files in '~/binDetails/' subdirectory. If the subdirectory does not exist, it will be created automatically.
#'
#' @param mydata A data frame of dataset consisting of only Xs and Y variables, the last column must be Y. All character x variables must be converted to factors in advance.
#' @param binMethod An integer from 1 to 5, indicates 5 different binning methods(see details), default \code{1}.
#' @param p A numeric, means percentage of records per bin, from 0 to 0.5, default \code{0.05}.
#' @param aliquots An integer, specifies the number of bins for equal-frequency or equal-interval binning method, default \code{5}.
#' @param mydict Optional, default \code{NULL}. File name character of variable dictionary with csv file extension, or a dataframe representing the variable dictionary, see details of this parameter in \code{\link{fread_basedict}}.
#'
#' @return A list
#' @family dataset binning and woe-encoding functions
#' @importFrom data.table fread
#' @importFrom smbinning smbinning.plot
#' @importFrom parallel detectCores
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterEvalQ
#' @importFrom parallel clusterExport
#' @importFrom parallel parLapply
#' @importFrom parallel stopCluster
#' @importFrom showtext showtext_auto
#' @importFrom vcd spine
#' @importFrom grDevices pdf
#' @importFrom grDevices dev.off
#' @importFrom graphics boxplot
#' @importFrom graphics mtext
#' @importFrom graphics par
#' @importFrom utils write.csv
#' @importFrom sysfonts font_add
#' @export
preBinningFun <- function(mydata,binMethod=1,p=0.05,aliquots=5,mydict=NULL) {
if(!binMethod %in% c(1,2,3,4,5)) stop("the parameter of 'binMethod' is not in c(1,2,3,4,5)")
if(aliquots<2) stop("the parameter of 'aliquots' is less than 2")
aliquots <- floor(aliquots)
# library(smbinning)
# library(parallel)
# library(showtext)
# library(vcd)
if(is.character(mydict)) {
if(length(mydict) != 1) stop("the length of parameter 'mydict' must be equal to 1 if it is a character string")
if(substr(tolower(mydict),nchar(mydict)-3,nchar(mydict)) != ".csv") stop("the parameter of 'mydict' must contain CSV file extension if it is a character string")
vardict <- fread(mydict,colClasses=list(character=c(1:2,4),numeric=c(3)),data.table=FALSE,na.strings="")
} else if(is.data.frame(mydict)) {
if(ncol(mydict) < 4) stop("the ncol of parameter 'mydict' must be more than or equal to 4 if it is a dataframe")
vardict <- mydict
} else if(is.null(mydict)) {
vardict <- data.frame(character(0),character(0),numeric(0),character(0))
} else {
stop("the parameter of 'mydict' must be a character string or a dataframe or default NULL")
}
cat("Data preparation...Please wait","\n")
summ <- summary(mydata)
summ <- t(summ)
write.csv(summ,"varSummary.csv",na = "")
mydata[,ncol(mydata)][mydata[,ncol(mydata)] == 0] <- 2
mydata[,ncol(mydata)][mydata[,ncol(mydata)] == 1] <- 0
mydata[,ncol(mydata)][mydata[,ncol(mydata)] == 2] <- 1
var_numeric <- sapply(mydata,is.numeric)
mydata_num <- mydata[var_numeric]
cat("dim of numeric variables:","\n")
print(dim(mydata_num[-length(mydata_num)]))
mydata[,ncol(mydata)] <- as.factor(mydata[,ncol(mydata)])
var_factor <- sapply(mydata,is.factor)
mydata_fact <- mydata[var_factor]
#be careful! if x is a factor, as.numeric will not return the desired result
mydata_fact[,ncol(mydata_fact)] <- as.numeric(as.character(mydata_fact[,ncol(mydata_fact)]))
cat("dim of factor variables:","\n")
print(dim(mydata_fact[-length(mydata_fact)]))
rm(mydata,summ)
gc()
x_num <- names(mydata_num)[-ncol(mydata_num)]
x_fact <- names(mydata_fact)[-ncol(mydata_fact)]
clnum <- detectCores()
cl <- makeCluster(getOption("cl.cores",clnum))
gEFC_env <- environment(getEqualFreqCuts)
clusterEvalQ(cl,library(smbinning))
clusterExport(cl,"getEqualFreqCuts",envir = gEFC_env)
cat("Parallel computing...This will take a while","\n")
myres <- parLapply(cl,x_num,mysmbinning,df=mydata_num,binMethod=binMethod,per=p,aliquots=aliquots) #Enable maximum number of core for parallel computing
myres_fact <- parLapply(cl,x_fact,mysmbinning.factor,df=mydata_fact)
stopCluster(cl)
myres <- lapply(myres,"[",c(1:2,4:6))
myres <- c(myres,myres_fact) #combine the list of myres with myres_fact
mydata_num[,ncol(mydata_num)][mydata_num[,ncol(mydata_num)] == 0] <- 2 #recover the transcoding of Y
mydata_num[,ncol(mydata_num)][mydata_num[,ncol(mydata_num)] == 1] <- 0
mydata_num[,ncol(mydata_num)][mydata_num[,ncol(mydata_num)] == 2] <- 1
mydata_fact[,ncol(mydata_fact)][mydata_fact[,ncol(mydata_fact)] == 0] <- 2
mydata_fact[,ncol(mydata_fact)][mydata_fact[,ncol(mydata_fact)] == 1] <- 0
mydata_fact[,ncol(mydata_fact)][mydata_fact[,ncol(mydata_fact)] == 2] <- 1
mydata_num_nm <- names(mydata_num[-ncol(mydata_num)])
mydata_fact_nm <- names(mydata_fact)
mydata_combine_nm <- c(mydata_num_nm,mydata_fact_nm)
mydata_num_ncol_x <- length(mydata_num_nm)
ivtable <- list()
for(i in 1:length(myres)) {
ivtable[[i]] <- myres[[i]][[1]]
if(is.data.frame(myres[[i]][[1]]))
names(ivtable)[i] <- myres[[i]][[3]]
else
names(ivtable)[i] <- mydata_combine_nm[i]
}
summiv <- data.frame(name=character(0),summaryIV=numeric(0),stringsAsFactors=FALSE)
insignvar <- data.frame(name=character(0),stringsAsFactors=FALSE)
opar <- par(no.readonly=TRUE)
showtext_auto(enable=TRUE)
tryCatch(font_add("msyh","msyh.ttf"),error=function(e) {font_add("msyh","msyh.ttc")})
pdf("binGraph.pdf")
par(family="msyh")
mypath <- getwd()
n1 <- 0
n2 <- 0
for(i in 1:length(myres)) {
if(is.data.frame(myres[[i]][[1]])) {
n1 <- n1 + 1
varrow <- which(vardict[,1]==myres[[i]][[3]])
if(length(varrow)!=0)
varnm_ch <- vardict[,2][varrow[1]]
else
varnm_ch <- myres[[i]][[3]]
par(mfrow=c(2,2))
#the "," in mydata_num[,myres[[i]][[3]]] is necessary
if(i<=mydata_num_ncol_x) {
boxplot(mydata_num[,myres[[i]][[3]]] ~ mydata_num[,ncol(mydata_num)],data=mydata_num, horizontal=T, frame=F, col="lightgray",main="Distribution")
mtext(paste("IV:",myres[[i]][[2]]),3)
}
# } else {
# counts <- table(mydata_fact[,myres[[i]][[3]]],mydata_fact[,ncol(mydata_fact)],useNA="ifany")
# if(nrow(counts)>10) {
# counts <- counts[1:10,]
# spine(counts,main="Distribution of top 10")
# mtext(paste("IV:",myres[[i]][[2]]),3)
# } else {
# spine(counts,main="Distribution")
# mtext(paste("IV:",myres[[i]][[2]]),3)
# }
# }
if(i<=mydata_num_ncol_x)
smbinning.plot(myres[[i]],option="dist",sub=varnm_ch)
else
smbinning.plot(myres[[i]],option="dist",sub=paste("IV:",myres[[i]][[2]]))
smbinning.plot(myres[[i]],option="badrate",sub=varnm_ch)
if(sum(is.infinite(ivtable[[i]]$WoE)) == 0)
smbinning.plot(myres[[i]],option="WoE",sub=varnm_ch)
summiv[n1,1] <- varnm_ch
summiv[n1,2] <- myres[[i]][[2]]
} else {
n2 <- n2 + 1
varrow <- which(vardict[,1]==mydata_combine_nm[i])
if(length(varrow)!=0)
varnm_ch <- vardict[,2][varrow[1]]
else
varnm_ch <- mydata_combine_nm[i]
par(mfrow=c(1,1))
if(i<=mydata_num_ncol_x) {
boxplot(mydata_num[,mydata_combine_nm[i]] ~ mydata_num[,ncol(mydata_num)],data=mydata_num, horizontal=T, frame=F, col="lightgray",main="Distribution")
mtext(varnm_ch,3)
} else {
counts <- table(mydata_fact[,mydata_combine_nm[i]],mydata_fact[,ncol(mydata_fact)],useNA="ifany")
if(nrow(counts)>7) {
counts <- counts[1:7,]
spine(counts,main="Distribution of top 7")
mtext(varnm_ch,3)
} else {
spine(counts,main="Distribution")
mtext(varnm_ch,3)
}
}
insignvar[n2,1] <- varnm_ch
}
if(!dir.exists(paste(mypath,"/binDetails/",sep=""))) dir.create(paste(mypath,"/binDetails/",sep=""))
file=paste(mypath,"/binDetails/",varnm_ch,".csv",sep="")
write.csv(ivtable[[i]],file)
}
file1 <- paste(mypath,"/summaryIV.csv",sep="")
write.csv(summiv,file1)
if(length(insignvar[,1])>0) {
file2 <- paste(mypath,"/insignificantVars.csv",sep="")
write.csv(insignvar,file2)
}
par(opar)
dev.off()
results <- list(myres,ivtable)
cat("completed","\n")
return(results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.