R/encodings.R

Defines functions frequencyEncode rareEncode uniformEncode gaussianEncode boxCoxEncode naEncode dummyEncode applyEncoding applyBoxCox applyUniform applyDummy applyRFEncoding applyGaussian

Documented in applyEncoding boxCoxEncode dummyEncode frequencyEncode gaussianEncode naEncode rareEncode uniformEncode

#' @title frequencyEncode
#' @description Re-encodes categorical data as its frequency in the dataset. Useful for gradient boosting.
#' Does NOT return dataset, but an object that can be applied to a dataset with the \code{applyEncoding} function.
#' If your data contains missing values, be very careful with the \code{encodeNA} and \code{allowNewLevels} parameters.
#' @param dt data.frame(table) to create the object on
#' @param vars vector of variables you want to frequency-encode
#' @param encodeNA Boolean. Should NAs be encodes as a frequency, or kept as NA when the transformation is applied? If
#' there are no NAs in your original data, new NAs will still be encoded as 1. Risky, but easy.
#' @param allowNewLevels Should any new levels be encoded as -1? Details:
#' \itemize{
#'   \item \code{TRUE}   Encodes new levels as -1. This is dangerous if your levels can change in the future, because you won't notice, and the model may not be tuned correctly.
#'   \item \code{FALSE}  Throws an error. You'll need to figure out how you want to proceed if \code{allowNewLevels = TRUE} is not good enough.
#' }
#' @return Frequency Encoded Object. This needs to be applied to a dataset, it will not actually return a dataset.
#' @importFrom utils stack
#' @importFrom data.table data.table setnames :=
#' @export
frequencyEncode <- function(
    dt
  , vars
  , encodeNA = FALSE
  , allowNewLevels = FALSE
) {

  # Testing
    # require(data.table)
    # dt <- catEncoding
    # vars <- c("Foundation","FireplaceQu","GarageCars","Street","FoundationFactr")
    # encodeNA = FALSE
    # allowNewLevels <- FALSE

  if (any(!vars %in% names(dt))) stop(paste0(vars[!vars %in% names(dt)]," are not names in dt."))
  if (encodeNA) table <- function(x) base::table(x,useNA = "ifany")

  dt <- copy(dt[,vars, with = F])

  encodList <- list()
  encodList$vars <- vars
  encodList$encodeNA <- encodeNA
  encodList$allowNewLevels <- allowNewLevels
  encodList$tables <- list()

  for (i in vars) {

    # i <- vars[[2]]

    # Needs to be handle all 4 cases of encodeNA and allowNewLevels,
    # As well as when NA is a new level.
      # FALSE:FALSE
      # FALSE:TRUE
      # TRUE:FALSE
      # TRUE:TRUE

    # Create a frequency table
    freqTab <- data.table(stack(table(dt[[i]])))[order(-get("values"))]

    # Do we need to add NA, or has it already been added?
    addNA <- !any(is.na(freqTab$ind))

    # table manipulations
    freqTab$ind <- as.character(freqTab$ind)
    setnames(freqTab, c("freq",i))
    freqTab[,"enc" := nrow(freqTab):1]

    # Add NA and new level specifications
    if (addNA & encodeNA) freqTab <- rbindlist(list(freqTab,list(0,NA,0)))
    if (addNA & !encodeNA) freqTab <- rbindlist(list(freqTab,list(0,NA,NA)))
    if (allowNewLevels) freqTab <- rbindlist(list(freqTab,list(0,"__NEWLEVEL__",-1)))
    if (!allowNewLevels) freqTab <- rbindlist(list(freqTab,list(0,"__NEWLEVEL__",NA)))

    encodList$tables[[i]] <- freqTab

  }

  class(encodList) <- "freqDefs"

  return(encodList)

}


#' @title rareEncode
#' @description Groups rare levels together, where 'rare' is defined by either minPerc or minCount.
#' @param dt Dataset to create object on.
#' @param vars variables you want to include in the encoding.
#' @param minPerc Minimum percentage of the total population required for a group to be considered 'rare'.
#' For example, if \code{minPerc = 0.1}, and \code{Gender = 'Female'} only makes up 5% of the dataset, then
#' \code{Female} would be replaced with the string in \code{newString}.
#' @param minCount Can be provided instead of minPerc if you have a better idea of credibility in terms of total
#' samples required. Levels with counts below this value will be grouped into \code{newstring}.
#' @param newString Value you want to group rare levels together as.
#' @param encodeNA If FALSE, NAs will remain NA even if they would have been grouped.
#' If TRUE, NAs will be grouped, but only if they are 'rare'. If they are not rare, NAs will be untouched.
#' @param allowNewLevels Similar to frequencyEncode parameter. This groups all new levels in with \code{newString}.
#' @return Rare Encoded Object
#' @export
rareEncode <- function(
    dt
  , vars
  , minPerc = NULL
  , minCount = NULL
  , newString = "rareGroup"
  , encodeNA = FALSE
  , allowNewLevels = FALSE
) {

  # Testing
    # require(data.table)
    # dt <- catEncoding
    # vars <- c("Foundation","FireplaceQu","GarageCars","Street","FoundationFactr")
    # minPerc = 0.05
    # minCount = NULL
    # encodeNA = FALSE
    # allowNewLevels = TRUE
    # newString = "rareGroup"


  rareObj <- list()
  rareObj$vars <- vars
  rareObj$minPerc <- minPerc
  rareObj$minCount <- minCount
  rareObj$newString <- newString
  rareObj$encodeNA <- encodeNA
  rareObj$allowNewLevels <- allowNewLevels
  rareObj$tables <- list()

  # Cannot provide both minPerc and minCount
  if((is.null(minPerc) & is.null(minCount)) | (!is.null(minPerc) & !is.null(minCount))) stop("Provide only 1 of minPerc and minCount")

  # Create minPerc from minCount
  if (is.null(minPerc)) minPerc <- minCount/nrow(dt)

  # Need sensible bounds
  if(minPerc <= 0 | minPerc >= 1) stop("minPerc represents a percentage, and should be between 0 and 1.")

  # A convenient shortcut.
  freqObj <- frequencyEncode(dt, vars, encodeNA = encodeNA, allowNewLevels = allowNewLevels)

  rareObj$tables <- lapply(freqObj$tables, function(x) {
    x$freq <- x$freq/sum(x$freq)
    if(encodeNA) x$enc <- ifelse(x$freq < minPerc, newString, as.character(x[[2]]))
    if(!encodeNA) x$enc <- ifelse(x$freq < minPerc & !is.na(x[[2]]), newString, as.character(x[[2]]))
    return(x)
    }
  )

  class(rareObj) <- "rareDefs"

  return(rareObj)

}


#' @title uniformEncode
#' @description Puts data from 1 range into another using a linear mapping. i.e. map data in {0-1} to {0-100}
#' @param dt Dataset to create object on.
#' @param vars variables you want to include in the encoding.
#' @param oldRange The old range of the variable. This can be user specified if you believe your data does not encompass the entire true range.
#' @param newRange the new range the map the values to. This can either be a vector, or a named list of vectors if you want different ranges for each variable (not typical).
#' @param encodeNA Values to set NA to. Can be a scalar or a named list of scalars (different values for each variable) if you are doing something complicated.
#' If TRUE, NAs will be grouped, but only if they are 'rare'. If they are not rare, NAs will be untouched.
#' @return Rare Encoded Object
#' @export
uniformEncode <- function(
    dt
  , vars
  , oldRange = lapply(dt[,vars,with=FALSE],function(x) return(c(min(x,na.rm=TRUE),max(x,na.rm=TRUE))))
  , newRange = c(0,1)
  , encodeNA = NA
) {

  # Testing
    # require(data.table)
    # dt <- readRDS("Data/numericEncodings.RDS")
    # vars <- c("LotFrontage","LotArea","GarageCars","BsmtFinSF2")
    # oldRange2 = c(10,20)
    # newRange = c(0,1)
    # encodeNA = NA
    # encodeNA = 0

  # data fidelity
  if (any(!vars %in% names(dt))) stop(paste0(vars[!vars %in% names(dt)]," are not names in dt."))
  testRanges <- function(x) {
    #x <- oldRange
    if(!class(unlist(x)) %in% c("numeric","integer")) stop("Range must be numeric")
    if(!length(unlist(x)) == 2) {
      if(!any(lapply(x,length) == 2)) stop("Range must be a list of vectors of length 2")
      if(!any(names(x) %in% names(dt))) stop(paste0(names(x)[!names(x) %in% names(dt)]," are not names in dt."))
      if(!setequal(names(x),vars)) stop("Names in Range are not the same as vars")
    }
  }
  testRanges(oldRange)
  testRanges(newRange)


  # Make into a list if they are not already.
  if(length(unlist(oldRange)) == 2) {
    oldRange <- lapply(vars,function(x) oldRange)
    names(oldRange) <- vars
  }
  if(length(unlist(newRange)) == 2) {
    newRange <- lapply(vars,function(x) newRange)
    names(newRange) <- vars
  }

  # Janky, but this needs to ocur after we blow up oldRange and newRange
  if (!setequal(vars,names(oldRange))) stop("Names in vars and oldRange not the same.")
  if (!setequal(vars,names(newRange))) stop("Names in vars and newRange not the same.")

  outsideRange <- sapply(vars,function(x) length(which(dt[[x]] < oldRange[[x]][[1]] | dt[[x]] > oldRange[[x]][[2]]))) > 0
  if(any(outsideRange)) warning(paste0("The following variables had values in the data outside the oldRange. This will result in values outside newRange: ",paste0(names(outsideRange)[outsideRange],collapse = ", ")))

  uniformObj <- list()
  uniformObj$vars <- vars
  uniformObj$oldRange <- oldRange
  uniformObj$newRange <- newRange
  uniformObj$encodeNA <- encodeNA
  uniformObj$params <- list()

  class(uniformObj) <- "uniformDefs"

  return(uniformObj)

}



#' @title gaussianEncode
#' @description
#' @param dt Dataset to create object on.
#' @param vars variables you want to include in the encoding.
#' @param minPerc Minimum percentage of the total population required for a group to be considered 'rare'.
#' For example, if \code{minPerc = 0.1}, and \code{Gender = 'Female'} only makes up 5% of the dataset, then
#' \code{Female} would be replaced with the string in \code{newString}.
#' @param minCount Can be provided instead of minPerc if you have a better idea of credibility in terms of total
#' samples required. Levels with counts below this value will be grouped into \code{newstring}.
#' @param newString Value you want to group rare levels together as.
#' @param encodeNA If FALSE, NAs will remain NA even if they would have been grouped.
#' If TRUE, NAs will be grouped, but only if they are 'rare'. If they are not rare, NAs will be untouched.
#' @return Rare Encoded Object
#' @export
gaussianEncode <- function(
    dt
  , vars
  , newMean = 0
  , newSD = 1
  , encodeNA = NA
) {

  # Testing
    # require(data.table)
    # dt <- readRDS("Data/numericEncodings.RDS")
    # vars <- c("LotFrontage","LotArea","GarageCars","BsmtFinSF2")

  oops <- which(!sapply(dt[,vars,with=FALSE],is.numeric))
  if(length(oops) > 0) stop(paste0("The following variables are not numeric or integer: ",names(oops)))


  # data fidelity
  if (any(!vars %in% names(dt))) stop(paste0(vars[!vars %in% names(dt)]," are not names in dt."))

  params <- lapply(vars,function(x) return(list(oldMean = mean(dt[[x]], na.rm = TRUE),oldSD = sd(dt[[x]], na.rm = TRUE))))
  names(params) <- vars

  gaussianObj <- list()
  gaussianObj$vars <- vars
  gaussianObj$newMean <- newMean
  gaussianObj$newSD <- newSD
  gaussianObj$params <- params
  gaussianObj$encodeNA <- encodeNA
  class(gaussianObj) <- "gaussianDefs"

  return(gaussianObj)

}

#' @title boxCoxEncode
#' @description
#' @param dt Dataset to create object on.
#' @param vars variables you want to include in the encoding.
#' @param lambda You can pass custom lambdas if you want. Not recommended.
#' @param minNormalize Box-Cox is a _risky_ transformation because it will fail if
#' it encounters a number <= 0. You can reduce this _riskyness_ by adding a certain amount of
#' 'space' between your expected range and 0. \code{minNormalize} represents the number of
#' standard deviations you want between 0 and the minimum number (lower bound) in the distribution.
#' This is set higher to ensure the variable never experiences a future number <= 0. Usually
#' safe being set pretty low if you have lots of data. If you have done some engineering
#' yourself to ensure this never happens, can be set to 0. All variables are automatically re-scaled,
#' Can either be a scalar or a named list of values, with names equal to vars.
#' @param capNegPredOutliers If you weren't careful enough with minNormalize and some
#' negative values end up coming through, do you want to cap them before they hit boxCox, or throw an error?
#' Safer to throw an error, so it's set to 0 by default. Then results in \code{applyEncoding}
#' trying to perform boxCox on 0, which will fail. If not 0, this number represents the number
#' of standard deviations above 0 that the numbers will be (min) capped at. Should be lower than minNormalize,
#' or the results will no longer be in the same order, since negative values will now be greater than the
#' minimum sample this encoding was created on.
#' @return BoxCox Encoded Object
#' @export
boxCoxEncode <- function(
    dt
  , vars
  , lambda = NULL
  , minNormalize = 0.05
  , capNegPredOutliers =0
) {

  # Testing
    # require(data.table)
    # dt <- readRDS("Data/numericEncodings.RDS")
    # vars <- c("LotFrontage","LotArea","GarageCars","BsmtFinSF2")
    # lambda <- 1
    # lambda <- NULL
    # minNormalize = 0.05
    # capNegPredOutliers <- 0.01

  # Data fidelity - damn there's a lot of checks for this one.
    oops <- which(!sapply(dt[,vars,with=FALSE],is.numeric))
    if(length(oops) > 0) stop(paste0("The following variables are not numeric or integer: ",names(oops)))

    # minNormalize Checks
      if(length(minNormalize) == 1 & !class(minNormalize) %in% c("numeric","integer")) stop("varMin must be a scalar or a named list of numbers.")
      if(!class(unlist(minNormalize)) %in% c("numeric","integer")) stop("minNormalize must be a scalar or a named list of numbers.")
      if(length(minNormalize) > 1) if(any(!names(minNormalize) %in% names(dt))) stop("names of minNormalize must be variables in dt.")
      if (length(minNormalize) == 1){
        if (!class(minNormalize) %in% c("numeric","integer")) stop("minNormalize needs to be a number.")
        minNormalize <- lapply(dt[,vars,with=FALSE],function(x) minNormalize)
      }
      if(any(unlist(minNormalize) < 0.001)) warning("minNormalize shouldn't be too low, this may cause wild transformations if values are close to 0.")


    if(!class(unlist(capNegPredOutliers)) %in% c("numeric","integer")) stop("capNegPredOutliers must be a scalar or a named list of numbers.")
    if(length(capNegPredOutliers) > 1) if(any(!names(capNegPredOutliers) %in% names(dt))) stop("names of capNegPredOutliers must be variables in dt.")
    if (length(capNegPredOutliers) == 1){
      if (!class(capNegPredOutliers) %in% c("numeric","integer")) stop("capNegPredOutliers needs to be a number.")
      capNegPredOutliers <- lapply(dt[,vars,with=FALSE],function(x) capNegPredOutliers)
    }
    if(any(unlist(minNormalize) < unlist(capNegPredOutliers))) stop(paste0("minNormalize needs to be greater than capNegPredOutliers."))


  # Calculate sd of distribution for each variable
  vecSD <- lapply(dt[,vars,with=FALSE],sd,na.rm=TRUE)

  # Calculate addition to range before it hits Box-Cox transformation.
  vecAdd <- lapply(vars,function(x) minNormalize[[x]]*vecSD[[x]] - min(dt[[x]],na.rm = TRUE))
  names(vecAdd) <- vars


  # If no lambda was passed, find lambda that results in 0 skewness of the variable's distribution
  if(is.null(lambda)) {

      boxCox <- function(x,lam,add) {
        if(lam==0) return(log(x+add)) else return(((x+add)^lam-1)/lam)
      }

      optimize <- function(lam,var) {
        e1071::skewness(boxCox(dt[[var]],lam,vecAdd[[var]]), na.rm = TRUE)^2
      }

      # var <- "LotFrontage"
      lambdas <- lapply(
          vars
        , function(var) optim(
              par = 0
            , fn = optimize
            , var = var
            , method = "L-BFGS-B"
            , lower = -5
            , upper = 5
          )$par
      )

      names(lambdas) <- vars

  } else if(length(lambda) == 1){

    if(!class(lambda) %in% c("numeric","integer")) stop("lambda must be a number")
    lambdas <- lapply(vars,function(x) lambda)
    names(lambdas) <- vars

  } else {

    if(length(lambda) != length(vars)) stop("What did you pass to lambda?????")
    if(!class(unist(lambda)) %in% c("numeric","integer")) stop("Lambda must be NULL, a named list of numbers, or a single number to use as lambda for all vars.")

    lambdas <- lambda

  }

  boxCoxObj <- list()
  boxCoxObj$vars <- vars
  boxCoxObj$vecAdd <- vecAdd
  boxCoxObj$vecSD <- vecSD
  boxCoxObj$lambdas <- lambdas
  boxCoxObj$minNormalize <- minNormalize
  boxCoxObj$capNegPredOutliers <- capNegPredOutliers

  class(boxCoxObj) <- "boxCoxDefs"
  return(boxCoxObj)

}


#' @title naEncode
#' @description Replaces NA and NaN values in \code{dt[vars]}. Will add a new level to factors.
#' @param dt dataset of class `data.table` or `data.frame`
#' @param naReplaceNumber replacement number for numerics.
#' @param naReplaceCharacter replacement string for characters and factors.
#' @param inPlace should the entire dataset be returned, or just the variables in \code{vars}
#' @param verbose Print results?
#'
#' @return
#' @export
#'
naEncode <- function(
  dt
  , vars
  , naReplaceNumber = NULL
  , naReplaceCharacter = NULL
  , inPlace = TRUE
  , verbose = T
){

  # Testing
    # require(data.table)
    # dt <- readRDS("Data/catEncoding.RDS")
    # dt$untouched <- rnorm(nrow(dt))
    # vars <- c("Foundation","FireplaceQu","GarageCars","Street","FoundationFactr")
    # i <- "FoundationFactr"
    # i <- "Foundation"
    # dt$FireplaceQu <- factor(dt$FireplaceQu)


  # Returning vars or all columns.
  if (inPlace) {
    dt <- copy(dt)
  } else {
    dt <- copy(dt[,names(obj), with = FALSE])
  }

  # Make sure vars actually exist in dt.
  if (any(!vars %in% names(dt))) stop(paste0(vars[!vars %in% names(dt)]," are not names in dt."))

  # Replace NaN with NA
  dt[,(vars) := lapply(.SD,function(x){x[is.nan(x)] <- NA;return(x)}), .SDcols = vars]

  # Do this iteratively
  for(i in vars) {

    # Check for NAs
    misNum <- sum(is.na(dt[[i]]))
    if(misNum == 0) {
      if(verbose) cat("No missing values in",i,"no changes made to this variable.")
      next
    } else {if(verbose) cat("Imputing",misNum,"values in",i)}

    # If everything looks good, perform the character imputation
    if(class(dt[[i]]) %in% c("character","factor")) {
      if(is.null(naReplaceCharacter)) stop(paste0("Tried to impute ",i," but no naReplaceCharacter supplied."))
      dt[,(i) := ifelse(is.na(get(i)),naReplaceCharacter,as.character(get(i)))]
    } else if(class(dt[[i]]) %in% c("numeric","integer")) {
      if(is.null(naReplaceNumber)) stop(paste0("Tried to impute ",i," but no naReplaceNumber supplied."))
      dt[,(i) := ifelse(is.na(get(i)),naReplaceNumber,get(i))]
    }

  }

  return(dt)

}


#' @title dummyEncode
#' @description Collects data about levels and user preferences to transform dataset into dummy variables.
#' @param dt data.frame(table) to create the object on
#' @param vars vector of variables you want to dummify
#' @param treatNA A string that specifies what you want to do with NA values. It is basically never a good
#' idea to have NA dummy variables, so that is not an option. Options are:
#' \itemize{
#'   \item \code{"newLevel"}   Simply creates a new level, which will be set to 1 when the variable is NA
#'   \item \code{"ghost"}      Sets all levels to 0. The information that this variable was NA is encoded into the data by the fact that none of the other levels are equal to 1. Bad idea for linear models.
#' }
#' @param sep The seperator between variable and level in the new column names. "." is safe usually.
#' @param setNA What to replace NA with if \code{treatNA = "newLevel"}.
#' @param fullRank Boolean. Copies carat syntax. If TRUE, the least common level is dropped so that linear models won't return blown up (yet valid) coefficients. A good conversation: https://stats.stackexchange.com/questions/231285/dropping-one-of-the-columns-when-using-one-hot-encoding
#' @param levelCountThresh Did you try to one-hot encode a floating point column? If your level count exceeds this value, the process stops and let you know which column it was.
#' @param values What to encode the values as. Should be a numeric vector of the form c(False value,Positive value).
#' Default one-hot encoding values are c(0,1). Sometimes, it is useful to encode as c(-1,1) for certain
#' NN activation functions.
#' @return Frequency Encoded Object. This needs to be applied. It will not actually return a dataset.
#' @importFrom utils stack
#' @importFrom data.table data.table setnames :=
#' @export
dummyEncode <- function(
    dt
  , vars
  , treatNA = c("newLevel","ghost")
  , sep = "."
  , setNA = "na"
  , fullRank = TRUE
  , levelCountThresh = 50
  , values = c(0,1)
) {

  # Testing
    # require(data.table)
    # dt <- catEncoding
    # dt$untouched <- rnorm(nrow(dt))
    # vars <- c("Foundation","FireplaceQu","GarageCars","Street","FoundationFactr")
    # treatNA <-  c("newLevel","ghost")
    # setNA = "na"
    # levelCountThresh = 50
    # fullRank = TRUE
    # sep = "."


  # data.table > data.frame
  dt <- copy(dt[,vars, with = FALSE])

  # The initialized vector is just to easily show the available options. Take the first one.
  treatNA <- treatNA[[1]]

  # Error handling. Add to this as you hit common user errors
  if (any(!vars %in% names(dt))) stop(paste0(vars[!vars %in% names(dt)]," are not names in dt."))

  # Useful Info
  lens <- sapply(dt[,vars,with=FALSE],function(x) length(unique(x)))

  # Check for levelCountThresh
  tooBig <- which(lens > levelCountThresh)
  if (length(tooBig) > 0) stop(paste0("The following variables have unique level counts exceeding levelCountThresh: ",paste0(names(tooBig),collapse = ", ")))

  # Don't dummy a variable with only 1 level.
  tooSmall <- which(lens < 2)
  if (length(tooSmall) > 0) stop(paste0("The following variables have only 1 level, please fix: ",paste0(names(tooSmall),collapse = ", ")))

  # Factors make my life a living hell
  dt[,(vars) := lapply(.SD, as.character), .SDcols = vars]

  # Impute missing vars.
  if (treatNA == "newLevel") for(j in vars) set(dt,which(is.na(dt[[j]])),j,setNA) # Ridiculously fast https://stackoverflow.com/questions/7235657/fastest-way-to-replace-nas-in-a-large-data-table

  # Get distinct values and counts
  freqs <- frequencyEncode(dt, vars = names(dt), encodeNA = TRUE)

  # Edit freq object - this setup is not too bad, small adjustment needed for what dummyVars expects
  # Need to filter out anything that doesn't actually exist.
  freqs$tables <- lapply(freqs$tables, function(x) x[freq > 0,])

  # If using fullRank, remove the least common level.
  if (fullRank == TRUE) freqs$tables <- lapply(freqs$tables, function(x) head(x,-1))

  # Get unique names - store as a vector.
  unqs <- lapply(freqs$tables, function(x) x[[2]])

  # If ghosting the NAs, just remove them from the possible values.
  if (treatNA == "ghost") unqs <- lapply(unqs, function(x) x[!is.na(x)])

  # Get new col names
  lvlNames <- lapply(names(unqs), function(x) paste(x, unqs[[x]], sep = sep))
  names(lvlNames) <- names(unqs)
  newNames <- as.character(unlist(lvlNames))

  # Define and return dummyDefs object
  ret <- list(
      newNames = newNames
    , lvlNames = lvlNames
    , oldNames = vars
    , sep = sep
    , Uniques = unqs
    , treatNA = treatNA
    , setNA = setNA
    , values = values
  )
  class(ret) <- "dummyDefs"
  return(ret)
}


#' @title applyEncoding
#' @description Applies an encoding object to a dataset. Only returns dataset.
#' @param dt dataset
#' @param obj encoding object you want to apply to \code{dt}
#' @param inPlace Return the entire dt or just the transformed columns
#' @return A dataset that has been transformed by an encoding object.
#' @importFrom data.table copy
#' @export
applyEncoding <- function(
    dt
  , obj
  , inPlace = TRUE
) {

  # Testing
    # require(data.table)
    # dt <- readRDS("Data/catEncoding.RDS")
    # dt$untouched <- rnorm(nrow(dt))
    # vars <- c("Foundation","FireplaceQu","GarageCars","Street","FoundationFactr")
    # obj <- frequencyEncode(dt,vars,encodeNA = TRUE)
    # obj <- rareEncode(dt,vars,minPerc = 0.05,encodeNA = TRUE)
    # i <- obj$vars[[3]]


  cl <- class(obj)
  if(!cl %in% c("freqDefs","rareDefs","dummyDefs","uniformDefs","gaussianDefs","boxCoxDefs")) stop("Object not recognized. Did you pass your object as the first argument?")

  # MANIPULATIONS FOR frequencyEncode AND rareEncode OBJECTS.
  # THESE ARE SIMPLE ENOUGH TO NOT REQUIRE THEIR OWN APPLY FUNCTIONS.

  if(cl %in% c("freqDefs","rareDefs")) z <- applyRFEncoding(dt,obj,inPlace)

  if(cl == "uniformDefs") z <- applyUniform(dt,obj,inPlace)

  if (cl == "dummyDefs") z <- applyDummy(dt,obj,inPlace)

  if (cl == "gaussianDefs") z <- applyGaussian(dt,obj,inPlace)

  if (cl == "boxCoxDefs") z <- applyBoxCox(dt,obj,inPlace)

  return(z)

}

applyBoxCox <- function(
    dt
  , obj
  , inPlace
) {

  # Testing
    # obj <- boxCoxEnc
    # dt <- numericEncodings
    # inPlace = FALSE

  # Data fidelity
  if (any(!obj$vars %in% names(dt))) stop(paste0(obj$vars[!obj$vars %in% names(dt)]," are not names in dt."))

  # Specify columns to return
  if (inPlace) {
    dt <- copy(dt)
  } else {
    dt <- copy(dt[,obj$vars, with = FALSE])
  }

  boxCox <- function(y,lam,add,SD,cap,var) {

    # y <- dt[[x]]
    # lam <- obj$lambdas[[x]]
    # add <- obj$vecAdd[[x]]
    # SD <- obj$vecSD[[x]]
    # cap <- obj$capNegPredOutliers[[x]]
    # var <- x

    y1 <- pmax(y + add,cap*SD)

    if(any(y1 <= 0,na.rm = TRUE)) stop(paste0("Tried to apply box-cox on value <= 0 for variable ",var))

    if(lam==0) return(log(y1)) else return((y1^lam-1)/lam)

  }

  # Perform boxcox on vars.
  # x <- names(dt)[[1]]
  dt[,(obj$vars) := lapply(obj$vars,function(x) {

    boxCox(dt[[x]],obj$lambdas[[x]],obj$vecAdd[[x]],obj$vecSD[[x]],obj$capNegPredOutliers[[x]],x)

  })]

  return(dt)

}

applyUniform <- function(
    dt
  , obj
  , inPlace
) {

  # Testing
    # obj <- uniEncod
    # dt <- numericEncodings
    # inPlace <- TRUE
    # inPlace <- FALSE

  dt <- copy(dt)

  if (any(!obj$vars %in% names(dt))) stop(paste0(obj$oldNames[!obj$oldNames %in% names(dt)]," were included in the uniform object, but are not column names in dt."))

  # Make the obvious warning
  outsideRange <- sapply(obj$vars,function(x) length(which(dt[[x]] < obj$oldRange[[x]][[1]] | dt[[x]] > obj$oldRange[[x]][[2]]))) > 0
  if(any(outsideRange)) warning(paste0("The following variables had values in the data outside the oldRange. This will result in values outside newRange: ",paste0(names(outsideRange)[outsideRange],collapse = ", ")))

  # Apply linear transformation - This could _definitely_ be made prettier.
  dt[,(obj$vars) := lapply(obj$vars,function(x) {
    or <- obj$oldRange[[x]]
    nr <- obj$newRange[[x]]
    orr <- or[[2]]-or[[1]]
    nrr <- nr[[2]]-nr[[1]]
    (dt[[x]] - or[[1]]) / orr * nrr + nr[[1]]
    }
  )]

  if(inPlace) return(dt) else return(dt[,obj$vars,with=FALSE])

}

applyDummy <- function(
    dt
  , obj
  , inPlace
) {

  # Testing
    # obj <- dummyEnc2
    # dt <- copy(catEncWithNewLevels)
    # inPlace <- FALSE


  dt <- copy(dt)

  # Data fidelity
  if (any(!obj$oldNames %in% names(dt))) stop(paste0(obj$oldNames[!obj$oldNames %in% names(dt)]," were included in the dummy object, but are not column names in dt."))

  # Factors make my life a living hell
  dt[,(obj$oldNames) := lapply(.SD, as.character), .SDcols = obj$oldNames]

  if (obj$treatNA == "newLevel") {

    for(j in obj$oldNames) set(dt,which(is.na(dt[[j]])),j,obj$setNA) # Ridiculously fast https://stackoverflow.com/questions/7235657/fastest-way-to-replace-nas-in-a-large-data-table

  }

  colOver <- lapply(obj$newNames, function(x) {
    loc <- regexpr(obj$sep, x, fixed = TRUE)
    return(c(substr(x,1,loc-1),substr(x,loc+1,nchar(x))))
  })

  # Ghosting the NAs just means every dummy column will be 0. No new level
  # is created, the information is just encoded as "not true" (0) for all columns.
  if (obj$treatNA == "ghost") {
    cols <- lapply(colOver, function(x) {
      j <- ifelse(dt[[x[[1]]]] == x[[2]],obj$values[[2]],obj$values[[1]])
      return(ifelse(is.na(j),obj$values[[1]],j))
    })
  } else {
    cols <- lapply(colOver, function(x) {
      ifelse(dt[[x[[1]]]] == x[[2]],obj$values[[2]],obj$values[[1]])
    })
  }

  setDT(cols)
  setnames(cols, obj$newNames)

  if (inPlace) {
    return(cbind(dt,cols))
  } else {
    return(cols)
  }
}

applyRFEncoding <- function(
    dt
  , obj
  , inPlace
) {

  # Testing
    # dt <- catEncoding
    # obj <- freqEncod_TRUE
    # inPlace = TRUE
    # dt <- rbindlist(list(dt,list("New","New","-1","Street","New",0)))

  # Specify columns to return
  if (inPlace) {
    dt <- copy(dt)
  } else {
    if (any(!obj$vars %in% names(dt))) stop(paste0(obj$vars[!obj$vars %in% names(dt)]," are not names in dt."))
    dt <- copy(dt[,obj$vars, with = FALSE])
  }

  for (i in obj$vars) {

    # i <- obj$vars[[2]]

    if(!class(dt[[i]]) %in% c("character")) dt[,(i) := as.character(get(i))]


    if(any(!unique(dt[[i]]) %in% obj$tables[[i]][[i]])) {
      if(obj$allowNewLevels) {
        warning(paste0("WARNING: NEW LEVEL DETECTED IN VARIABLE ",i,". allowNewLevels IS SET TO TRUE, SO THESE WILL BE ENCODED AS newString or -1."))
        dt[[i]][which(!dt[[i]] %in% obj$tables[[i]][[i]])] <- "__NEWLEVEL__"
      } else {
        stop(paste0("NEW LEVEL DETECTED IN VARIABLE ",i,". allowNewLevels IS SET TO FALSE, PROCESS STOPPING."))
      }
    }

    enc <- merge(
      x = dt[,(i),  with = FALSE]
      , y = obj$tables[[i]][,-"freq"]
      , sort = FALSE
      , all.x = TRUE
    )

    dt[,(i) := enc$enc]

  }

  return(dt)

}

applyGaussian <- function(
    dt
  , obj
  , inPlace
) {

  # Testing
    # obj <- gaussEnc
    # dt <- readRDS("Data/numericEncodings.RDS")
    # inPlace <- TRUE

  if (any(!obj$vars %in% names(dt))) stop(paste0(obj$vars[!obj$vars %in% names(dt)]," are not names in dt."))

  # Specify columns to return
  if (inPlace) {
    dt <- copy(dt)
  } else {
    dt <- copy(dt[,obj$vars, with = FALSE])
  }

  dt[,(obj$vars) := lapply(obj$vars,function(x) (dt[[x]] - obj$params[[x]]$oldMean)/(obj$params[[x]]$oldSD / obj$newSD) + obj$newMean)]

  return(dt)

}
AnotherSamWilson/helperFuncs documentation built on Oct. 1, 2019, 8:51 p.m.