# R/codeWOE.R

#' Code the WOE value for each band of a variable.
#'
#' Replaces every value of `xVar` with the Weight of Evidence (WOE) of the
#' band (group) that the value was assigned to in `x`. The lookup is done in
#' two steps: the value is matched against `names(x$hcGroup)` to obtain its
#' group label, and the group label is matched against `names(x$woe)` to
#' obtain the WOE.
#'
#' Infinite WOE values are capped: `-Inf` is replaced by `-10` and `Inf` by
#' `10`. When `x$woe` has exactly one element, or all of its elements are
#' equal to 1, an unnamed vector of ones is returned instead of a lookup
#' result.
#'
#' @param x An object of class `Clusterrr`. Its elements `woe` (WOE values
#'   named by group label) and `hcGroup` (group labels named by the original
#'   values of the variable) are used.
#' @param xVar A vector with the values of the variable to be replaced by
#'   their WOE.
#' @return A vector of WOE values with the same length as `xVar`. In the
#'   regular case it is named after the matched entries of `x$hcGroup`;
#'   values of `xVar` that cannot be matched yield `NA`.
#' @examples
#' data(lendclub)
#' x <- doClustering(lendclub, "grade", "loan_status")
#' codeWOE(x, lendclub$grade)
#' # How to build a quick model using the package?
#' # Assuming two characteristics were chosen to build a logistic
#' # regression model:
#' x <- doClustering(lendclub, "purpose", "loan_status")
#' purposeCoded <- codeWOE(x, lendclub$purpose)
#' x <- doClustering(lendclub, "grade", "loan_status")
#' gradeCoded <- codeWOE(x, lendclub$grade)
#' dt <- data.frame(y = lendclub$loan_status,
#'                  x1 = purposeCoded,
#'                  x2 = gradeCoded)
#' # Divide the population into two disjoint sets for learning and validation:
#' dt.train <- dt[1:500000, ]
#' dt.test <- dt[500001:nrow(dt), ]
#' # Build a quick model:
#' model <- glm(data = dt.train, formula = y ~ x1 + x2, family = "binomial")
#' dt.test$prediction <- predict(model, dt.test, type = "response")
#' # Check the strength of fit:
#' # pROC::auc(ifelse(dt.test$y, 1, 0), dt.test$prediction) * 2 - 1 # Gini
#' # pROC::plot.roc(ifelse(dt.test$y, 1, 0), dt.test$prediction) # ROC curve
#' @export
codeWOE <- function(x, xVar) {

  # inherits() stays a scalar test even when `x` carries several classes;
  # comparing class(x) with `!=` would hand `if` a condition of length > 1.
  if (!inherits(x, "Clusterrr")) {
    stop("needs Clusterrr object")
  }

  woe <- x$woe
  hcGroup <- x$hcGroup

  # Cap infinite WOE values at -10 / 10 so they stay usable as numbers.
  # which() drops NA comparisons, so missing WOE values are left untouched.
  woe[which(woe == -Inf)] <- -10
  woe[which(woe == Inf)] <- 10

  # Degenerate case: a single WOE value, or every WOE equal to 1 -> constant
  # vector of ones. isTRUE() keeps the condition a plain TRUE/FALSE when
  # `woe` contains NA.
  # NOTE(review): WOE == 1 looks like a "no real banding" sentinel set
  # upstream - confirm against doClustering().
  if (length(woe) == 1L || isTRUE(all(woe == 1))) {
    return(rep(1, length(xVar)))
  }

  # Step 1: original value -> group label (names are the matched values).
  groups <- hcGroup[match(xVar, names(hcGroup))]

  # Step 2: group label -> WOE; labels are compared as character strings
  # because names(woe) is a character vector.
  woeVect <- woe[match(as.character(groups), names(woe))]
  names(woeVect) <- names(groups)

  woeVect
}
# wojciechoblak/varbinq documentation built on May 4, 2019, 9:46 a.m.