R/continuous.R

Defines functions WoETableContinuous

#' @include common.R
#'
#' Weight of Evidence and Information Value for a continuous variable
#'
#' This function calculates the Weight of Evidence and Infomation value for a particular response.
#' The variable tested is continuous. The response is assumed to be logical. If not, if
#' the response takes only 2 values (either factor or numerical), it will be transformed into
#' logical values. `verbose = TRUE` shows the conversion.
#'
#' @param df Dataframe containing the two columns of data
#' @param x  Name of the continuous variable as a string
#' @param y  Name of the boolean response as a string
#' @param p  Parameter for the Conditional Recuresive Tree used (see \code{partykit})
#' @param verbose Boolean to add additional information. Default is \code{FALSE}
#'
#' @return A list with the total Information Value, and a tibble containing the bins with detailed
#'  information.
#'
#' @import checkmate
#' @import tidyverse
#' @import partykit
#'
#' @export
#'
#' @examples [TODO]
WoETableContinuous <-
  function(df, x, y, p = 0.05, verbose = FALSE) {
    assertDataFrame(df)
    assertString(x)
    assertString(y)
    assertNumber(p, lower = 0, upper = 0.5)
    assertFlag(verbose)

    # Make symbol out of name strings
    xSym <- sym(x)
    ySym <- sym(y)

    # Check type of y
    if (class(df[[1, y]]) == "logical") {
      if (verbose == TRUE) {
        cat("y (", y, ") is logical: OK \n")
      }
    } else {
      if (verbose == TRUE) {
        cat("y (", y, ") is not logical. Attempting conversion \n")
      }

      # Transforms y to boolean
      df <- ensureLogical(df, y, verbose = verbose)
    }

    if (verbose == TRUE) {
      cat("Start recursive partitioning. \n")
      startTime <- Sys.time()
    }

    # Conditional recursive partioning requires 0/1 variable
    treeFrame <- df %>%
      dplyr::mutate(!!ySym := if_else(!!ySym, 1, 0)) %>%
      dplyr::select(!!xSym, !!ySym) %>%
      as.data.frame()

    treeControl <-
      ctree_control(minbucket = ceiling(round(p * nrow(df))))

    binTree <- ctree(
      formula(paste(y, "~", x)),
      data = treeFrame,
      na.action = na.exclude,
      control = treeControl
    )

    if (verbose == TRUE) {
      cat("Done. Elapsed: ", Sys.time() - startTime, "\n")
    }
    if (verbose == TRUE) {
      cat("Recursive tree: \n")
      print(binTree)
    }

    bins <- width(binTree)
    if (verbose == TRUE) {
      cat("Number of bins = ", bins, "\n")
    }
    if (!testNumber(bins, lower = 2)) {
      warning(
        "Number of bins for x is not at least 2. Returning NA. (Hint: maybe try to convert x from numeric to factors)\n"
      )
      return(NA)
    }

    # Number of nodes
    nNodes <- length(binTree)
    if (verbose == TRUE) {
      cat("Number of nodes = ", nNodes, "\n")
    }


    # Collect all the cutting points
    listCuts <- tibble(CutNumber = 1:nNodes, CutPoint = 0.0)
    for (i in 1:nNodes) {
      listCuts[i, 2] <-
        ifelse(is.numeric(binTree[i]$node$split$breaks),
               binTree[i]$node$split$breaks,
               NA)
    }

    # Min and Max of x
    xMin <-
      min(df %>% dplyr::select(!!x) %>% dplyr::filter(!is.na(!!xSym)))

    xMax <-
      max(df %>% dplyr::select(!!x) %>% dplyr::filter(!is.na(!!xSym)))

    # Clean up the list of Cuts and add one for everything above the last cut
    listCuts <- listCuts %>%
      dplyr::filter(!is.na(CutPoint)) %>%
      dplyr::arrange(CutPoint) %>%
      dplyr::add_row(CutPoint = -Inf, .before = 1) %>%
      dplyr::add_row(CutPoint = xMax) %>%
      dplyr::mutate(CutNumber = row_number() - 1)


    # Prepare a result tibble
    result <- listCuts %>%
      dplyr::mutate(
        name        = "",
        Min         = 0.0,
        Max         = 0.0,
        Count       = 0.0,
        nGood       = 0.0,
        nBad        = 0.0,
        cumCount    = 0.0,
        cumGood     = 0.0,
        cumBad      = 0.0,
        pctCount    = 0.0,
        pctGood     = 0.0,
        pctBad      = 0.0,
        OddsInBin   = 0.0,
        LnOddsInBin = 0.0,
        WoEInBin    = 0.0,
        IVInBin     = 0.0,
        WoE         = 0.0,
        IV          = 0.0
      )

    # Calculate total Goods and Bads
    totalGood  <- (df %>% dplyr::filter(!!ySym == TRUE)  %>% nrow())
    totalBad   <- (df %>% dplyr::filter(!!ySym == FALSE) %>% nrow())
    totalCount <- totalGood + totalBad

    totalBins <- nrow(listCuts)

    for (currentCut in 2:totalBins) {
      minBinCut <- as.numeric(listCuts[currentCut - 1, "CutPoint"])
      maxBinCut <-
        as.numeric(listCuts[currentCut,     "CutPoint"])

      if (verbose == TRUE) {
        cat(" Cut row number: ",
            currentCut,
            ", min = ",
            minBinCut,
            ", max = ",
            maxBinCut,
            "\n")
      }

      xBand <- df %>%
        dplyr::select(!!xSym, !!ySym) %>%
        dplyr::filter(between(!!xSym, minBinCut, maxBinCut) &
                        (!!xSym) != minBinCut)

      result[currentCut, "Min"]   <- minBinCut
      result[currentCut, "Max"]   <- maxBinCut
      result[currentCut, "Count"] <- nrow(xBand)
      result[currentCut, "nGood"] <- nrow(xBand %>% dplyr::filter(!!ySym == TRUE))
      result[currentCut, "nBad"]  <- nrow(xBand %>% dplyr::filter(!!ySym == FALSE))

      result[currentCut, "Name"]  <- paste0(as.character(minBinCut), "<",
                                            as.character(currentCut), "<=",
                                            as.character(maxBinCut))
    }

    # Remove any NA's in case some categories didn't have any true or false
    # otherwise this immediately leads to infinite Information Values
    result[is.na(result)] <- 0


    # Add row for missing / NA values
    xBand <- df %>%
      dplyr::select(!!xSym, !!ySym) %>%
      dplyr::filter(is.na(!!xSym) & !is.na(!!ySym))

    if (verbose == TRUE) {
      cat("Number of missing/NA values: ", nrow(xBand), "\n")
    }
    if (nrow(xBand) > 0) {
      result <- result %>% add_row(CutNumber = totalBins)

      result[totalBins + 1, "CutPoint"] <- NA
      result[totalBins + 1, "Min"]      <- NA
      result[totalBins + 1, "Max"]      <- NA
      result[totalBins + 1, "Count"]    <- nrow(xBand)
      result[totalBins + 1, "nGood"]    <- nrow(xBand %>% dplyr::filter(!!ySym == TRUE))
      result[totalBins + 1, "nBad"]     <- nrow(xBand %>% dplyr::filter(!!ySym == FALSE))
    }

    # Fill the rest of the columns
    result <-
      result %>%
      dplyr::mutate(
        cumCount    = cumsum(Count),
        cumGood     = cumsum(nGood),
        cumBad      = cumsum(nBad),

        pctCount    = Count / totalCount,
        pctGood     = nGood / totalGood,
        pctBad      = nBad  / totalBad,
        pctGoodBin  = nGood / Count,
        pctBadBin   = nBad  / Count,

        OddsInBin   = pctGoodBin / (1 - pctGoodBin),
        LnOddsInBin = log(OddsInBin),

        WoEInBin    = log(pctGoodBin) - log(pctBadBin),
        IVInBin     = (pctGoodBin - pctBadBin) * WoE,

        WoE         = log(pctGood) - log(pctBad),
        IV          = (pctGood - pctBad) * WoE
      )

    # Remove the first -Infinity cut point
    result <- result %>% slice(2:n())

    return(list(
      IV = sum(result$IV[!is.infinite(result$IV)]),
      type = "numeric",
      table = result
    ))
  }
Emmanuel-R8/SMBinning documentation built on Dec. 9, 2019, 6:39 p.m.