Support/Scripts/testEncodings.R

rm(list = ls())
require(data.table)
catEncoding <- readRDS("Support/Data/catEncoding.RDS")
catEncoding$untouched <- rnorm(nrow(catEncoding))
catVars <- c("Foundation","FireplaceQu","GarageCars","Street","FoundationFactr")


numericEncodings <- readRDS("Support/Data/numericEncodings.RDS")
numerVars <- c("LotFrontage","LotArea","GarageCars","BsmtFinSF2")

source("R/encodings.R")


# FREQUENCY ENCODING
  freqEncod_TRUE <- frequencyEncode(catEncoding,catVars,encodeNA = TRUE, allowNewLevels = TRUE)
  freqEncod_FALSE <- frequencyEncode(catEncoding,catVars,encodeNA = FALSE, allowNewLevels = FALSE)
  freqEncod_MIX1 <- frequencyEncode(catEncoding,catVars,encodeNA = TRUE, allowNewLevels = FALSE)
  freqEncod_MIX2 <- frequencyEncode(catEncoding,catVars,encodeNA = FALSE, allowNewLevels = TRUE)

  # Create DT with new Levels
  catEncWithNewLevels <- rbindlist(list(catEncoding,list("New","TA",10,"Pave","New",0)))

  freqDT_TRUE <- applyEncoding(catEncWithNewLevels,freqEncod_TRUE)     # Throw warnings
  freqDT_FALSE <- applyEncoding(catEncWithNewLevels,freqEncod_FALSE)   # Throw Error
  freqDT_MIX1 <- applyEncoding(catEncWithNewLevels,freqEncod_MIX1)     # Throw Error
  freqDT_MIX2 <- applyEncoding(catEncWithNewLevels,freqEncod_MIX2)     # Throw Warnings


# RARE ENCODINGS - NAs will only be encoded if they are actually rare. Otherwise, just use naEncode.
  rareEncod_TRUE <- rareEncode(catEncoding,catVars,minPerc = 0.05, encodeNA = TRUE, allowNewLevels = TRUE)
  rareEncod_FALSE <- rareEncode(catEncoding,catVars,minPerc = 0.05, encodeNA = FALSE, allowNewLevels = FALSE)
  rareEncod_MIX1 <- rareEncode(catEncoding,catVars,minPerc = 0.05, encodeNA = TRUE, allowNewLevels = FALSE)
  rareEncod_MIX2 <- rareEncode(catEncoding,catVars,minPerc = 0.05, encodeNA = FALSE, allowNewLevels = TRUE)

  rareDT_TRUE <- applyEncoding(catEncWithNewLevels, rareEncod_TRUE)    # Throw a Warning
  rareDT_FALSE <- applyEncoding(catEncWithNewLevels, rareEncod_FALSE)  # Error
  rareDT_MIX1 <- applyEncoding(catEncWithNewLevels, rareEncod_MIX1)    # Error
  rareDT_MIX2 <- applyEncoding(catEncWithNewLevels, rareEncod_MIX2)    # Warning



# UNIFORM ENCODING
  uniEncod <- uniformEncode(
    numericEncodings
    , vars = numerVars
  )
  uniDT <- applyEncoding(numericEncodings,uniEncod, inPlace = TRUE)


# GAUSSIAN ENCODING
  gaussEnc <- gaussianEncode(numericEncodings,numerVars)
  gaussEnc <- gaussianEncode(numericEncodings,numerVars,newMean = 10,newSD = 10)
  gaussDT <- applyEncoding(numericEncodings,gaussEnc)
  lapply(gaussDT[,gaussEnc$vars,with=FALSE],mean,na.rm=TRUE)
  lapply(gaussDT[,gaussEnc$vars,with=FALSE],sd,na.rm=TRUE)


# BoxCox ENCODING

  boxCoxEnc <- boxCoxEncode(numericEncodings,numerVars)
  boxCoxDT <- applyEncoding(numericEncodings,boxCoxEnc)

  require(gridExtra)
  require(gridExtra,quietly = TRUE)
  p1 <- ggplot(boxCoxDT[!is.na(LotArea)], aes(x = LotArea)) + geom_density() + ggtitle("Transformed With Box-Cox  |  Skewness = 0")
  p2 <- ggplot(numericEncodings[!is.na(LotArea)], aes(x = LotArea)) + geom_density() + ggtitle("Original Data  |  Skewness = 12.18")
  grid.arrange(p2,p1, ncol = 1)


  boxCoxEnc <- boxCoxEncode(numericEncodings,numerVars,minNormalize = list(LotFrontage = 1, LotArea = 0.05, GarageCars = 0.05, BsmtFinSF2 = 0.05))

  lapply(numericEncodings[,numerVars,with=FALSE],min,na.rm=TRUE)

# DUMMY ENCODING
  dummyEnc1 <- dummyEncode(catEncoding,catVars,"newLevel",".","na",TRUE,50)
  dummyEnc2 <- dummyEncode(catEncoding,catVars,"ghost",".","na",TRUE,50)
  dummyEnc3 <- dummyEncode(catEncoding,catVars,"ghost",".","na",FALSE,50)
  dummyEnc4 <- dummyEncode(catEncoding,catVars,"newLevel",".","na",TRUE,50)
  dummyEnc5 <- dummyEncode(catEncoding,catVars,"newLevel",".","na",TRUE,50)
  dummyTdt1 <- applyEncoding(catEncoding,dummyEnc1)
  dummyTdt2 <- applyEncoding(catEncoding,dummyEnc2)
  dummyTdt3 <- applyEncoding(catEncoding,dummyEnc3)
  dummyTdt4 <- applyEncoding(catEncoding,dummyEnc4)
  dummyTdt5 <- applyEncoding(catEncoding,dummyEnc5)
  if(sum(dummyTdt2[is.na(catEncoding$FireplaceQu),grep("FireplaceQu",dummyEnc2$newNames,value=T),with=FALSE])>0) stop("treatNA = 'ghost' failed on fullRank = T")
  if(sum(dummyTdt3[is.na(catEncoding$FireplaceQu),grep("FireplaceQu",dummyEnc3$newNames,value=T),with=FALSE])>0) stop("treatNA = 'ghost' failed on fullRank = F.")

  # View effect on variable with NAs
  dummyTdt1[,grep("FireplaceQu",dummyEnc1$newNames,value=T),with=FALSE]
  dummyTdt2[,grep("FireplaceQu",dummyEnc2$newNames,value=T),with=FALSE]
  dummyTdt3[,grep("FireplaceQu",dummyEnc3$newNames,value=T),with=FALSE]
  dummyTdt4[,grep("FireplaceQu",dummyEnc4$newNames,value=T),with=FALSE]
  dummyTdt5[,grep("FireplaceQu",dummyEnc5$newNames,value=T),with=FALSE]


  # Create DT with new Levels
  catEncWithNewLevels <- rbindlist(list(catEncoding,list("New","TA",10,"Pave","New",0)))

  # View Effect that a new level has on encoding:
  dummyTdt1 <- applyEncoding(catEncWithNewLevels,dummyEnc1)



  catEncoding2 <- catEncoding[sample(1:nrow(catEncoding), size = 1000000, replace = TRUE)]

  system.time(
    {
      dummyEnc1 <- dummyEncode(catEncoding2, catVars)
      dummyDT1 <- applyEncoding(catEncoding2, dummyEnc1)
    }
  )

  require(caret) # Caret package must be loaded to use dummyVars
  system.time(
    {
      caretDummy <- dummyVars(~.,catEncoding2[,catVars,with=FALSE])
      caretDumDT <- predict(caretDummy,catEncoding2)
      caretDumDT <- as.data.table(caretDumDT)
    }
  )
AnotherSamWilson/helperFuncs documentation built on Oct. 1, 2019, 8:51 p.m.