R/bin.valid_Function.R

Defines functions bin.valid

bin.valid <- function(x, y, data, p = 0.05, q = 0.01, s = 0.1, abbrev_length = 20, event_class = 1,
                      train_samples = 10, n_tests = 10, prop = 0.6, seed = 123) {
  
  require(woeBinning)
  data <- data[, c(y,x)] %>% as.data.frame()
  
  #----- Define Variables
  woe_train <- NULL
  woe_tree_train <- NULL
  woe_mse <- NULL
  woe_tree_mse <- NULL
  
  #----- Set Seeds for Train
  set.seed(seed = seed)
  train_seeds <- sample.int(1000, train_samples)
  
  #----- Create Train WoE(s) through random sampling
  for (i in 1:train_samples) {
    set.seed(train_seeds[i])
    TrainIndex <- createDataPartition(as.factor(data[, y]), 
                                      p = prop, list = F)
    df_train <- data[TrainIndex,]
    # df_test <- data[-TrainIndex,]
    
    woe_train[[i]] <- woe.binning(
      df = df_train,
      target.var = y,
      pred.var = x,
      min.perc.total = p,
      min.perc.class = q,
      stop.limit = s,
      event.class = event_class,
      abbrev.fact.levels = abbrev_length
    )
    
    woe_tree_train[[i]] <- woe.tree.binning(
      df = df_train,
      target.var = y,
      pred.var = x,
      min.perc.total = p,
      min.perc.class = q,
      stop.limit = s,
      event.class = event_class,
      abbrev.fact.levels = abbrev_length
    )
    
  }
  
  #----- Apply Train WoE on random test samples
  for (j in 1:length(woe_train)) {
    
    woe_test <- NULL
    woe_tree_test <- NULL
    
    set.seed(seed = seed)
    test_seeds <- sample.int(1000, n_tests)
    
    for (k in 1:n_tests) {
      
      set.seed(test_seeds[k])
      TrainIndex <- createDataPartition(as.factor(data[, y]), 
                                        p = prop, list = F)
      df_test <- data[-TrainIndex,]
      
      df_test_woe <- woe.binning.deploy(df = df_test, woe_train[[j]])
      names(df_test_woe) <- gsub(pattern = '\\.', replacement = '_', x = names(df_test_woe))
      df_test_woe[, paste(x, '_binned', sep = '')] <- gsub(pattern = '\\,', replacement = '_', x = df_test_woe[, paste(x, '_binned', sep = '')])
      
      df_test_woe_tree <- woe.binning.deploy(df = df_test, woe_tree_train[[j]])
      names(df_test_woe_tree) <- gsub(pattern = '\\.', replacement = '_', x = names(df_test_woe_tree))
      df_test_woe_tree[, paste(x, '_binned', sep = '')] <- gsub(pattern = '\\,', replacement = '_', x = df_test_woe_tree[, paste(x, '_binned', sep = '')])
      
      woe_test[k] <- bins(x = paste(x, '_binned', sep = ''), y = y, data = df_test_woe, type = 'f')$info$iv
      woe_tree_test[k] <- bins(x = paste(x, '_binned', sep = ''), y = y, data = df_test_woe_tree, type = 'f')$info$iv
      
    }
    
    #-- Calculate MSE for each Train WoE
    woe_mse[j] <- (woe_train[[1]][[3]] - woe_test)^2 %>% mean(na.rm = T)
    woe_tree_mse[j] <- (woe_tree_train[[1]][[3]] - woe_tree_test)^2 %>% mean(na.rm = T)
    
  }
  
  #----- Output
  ds_results <- tibble(
    WoE = woe_train,
    WoE_MSE = woe_mse
  ) %>% 
    arrange(WoE_MSE) %>% 
    slice(1)
  
  ds_tree_results <- tibble(
    WoE = woe_tree_train,
    WoE_MSE = woe_tree_mse
  ) %>% 
    arrange(WoE_MSE) %>% 
    slice(1)
  
  output <- list(WoE = ds_results$WoE[1][[1]], WoE_MSE = ds_results$WoE_MSE,
                 WoE_Tree = ds_tree_results$WoE[1][[1]], WoE_Tree_MSE = ds_tree_results$WoE_MSE)
  
}
Ehsan-F/R-Mixtape documentation built on June 24, 2020, 12:22 a.m.