rm(list = ls())
require(data.table)
catEncoding <- readRDS("Support/Data/catEncoding.RDS")
catEncoding$untouched <- rnorm(nrow(catEncoding))
catVars <- c("Foundation","FireplaceQu","GarageCars","Street","FoundationFactr")
numericEncodings <- readRDS("Support/Data/numericEncodings.RDS")
numerVars <- c("LotFrontage","LotArea","GarageCars","BsmtFinSF2")
source("R/encodings.R")
# FREQUENCY ENCODING
freqEncod_TRUE <- frequencyEncode(catEncoding,catVars,encodeNA = TRUE, allowNewLevels = TRUE)
freqEncod_FALSE <- frequencyEncode(catEncoding,catVars,encodeNA = FALSE, allowNewLevels = FALSE)
freqEncod_MIX1 <- frequencyEncode(catEncoding,catVars,encodeNA = TRUE, allowNewLevels = FALSE)
freqEncod_MIX2 <- frequencyEncode(catEncoding,catVars,encodeNA = FALSE, allowNewLevels = TRUE)
# Create DT with new Levels
catEncWithNewLevels <- rbindlist(list(catEncoding,list("New","TA",10,"Pave","New",0)))
freqDT_TRUE <- applyEncoding(catEncWithNewLevels,freqEncod_TRUE) # Throw warnings
freqDT_FALSE <- applyEncoding(catEncWithNewLevels,freqEncod_FALSE) # Throw Error
freqDT_MIX1 <- applyEncoding(catEncWithNewLevels,freqEncod_MIX1) # Throw Error
freqDT_MIX2 <- applyEncoding(catEncWithNewLevels,freqEncod_MIX2) # Throw Warnings
# RARE ENCODINGS - NAs will only be encoded if they are actually rare. Otherwise, just use naEncode.
rareEncod_TRUE <- rareEncode(catEncoding,catVars,minPerc = 0.05, encodeNA = TRUE, allowNewLevels = TRUE)
rareEncod_FALSE <- rareEncode(catEncoding,catVars,minPerc = 0.05, encodeNA = FALSE, allowNewLevels = FALSE)
rareEncod_MIX1 <- rareEncode(catEncoding,catVars,minPerc = 0.05, encodeNA = TRUE, allowNewLevels = FALSE)
rareEncod_MIX2 <- rareEncode(catEncoding,catVars,minPerc = 0.05, encodeNA = FALSE, allowNewLevels = TRUE)
rareDT_TRUE <- applyEncoding(catEncWithNewLevels, rareEncod_TRUE) # Throw a Warning
rareDT_FALSE <- applyEncoding(catEncWithNewLevels, rareEncod_FALSE) # Error
rareDT_MIX1 <- applyEncoding(catEncWithNewLevels, rareEncod_MIX1) # Error
rareDT_MIX2 <- applyEncoding(catEncWithNewLevels, rareEncod_MIX2) # Warning
# UNIFORM ENCODING
uniEncod <- uniformEncode(
numericEncodings
, vars = numerVars
)
uniDT <- applyEncoding(numericEncodings,uniEncod, inPlace = TRUE)
# GAUSSIAN ENCODING
gaussEnc <- gaussianEncode(numericEncodings,numerVars)
gaussEnc <- gaussianEncode(numericEncodings,numerVars,newMean = 10,newSD = 10)
gaussDT <- applyEncoding(numericEncodings,gaussEnc)
lapply(gaussDT[,gaussEnc$vars,with=FALSE],mean,na.rm=TRUE)
lapply(gaussDT[,gaussEnc$vars,with=FALSE],sd,na.rm=TRUE)
# BoxCox ENCODING
boxCoxEnc <- boxCoxEncode(numericEncodings,numerVars)
boxCoxDT <- applyEncoding(numericEncodings,boxCoxEnc)
require(gridExtra)
require(gridExtra,quietly = TRUE)
p1 <- ggplot(boxCoxDT[!is.na(LotArea)], aes(x = LotArea)) + geom_density() + ggtitle("Transformed With Box-Cox | Skewness = 0")
p2 <- ggplot(numericEncodings[!is.na(LotArea)], aes(x = LotArea)) + geom_density() + ggtitle("Original Data | Skewness = 12.18")
grid.arrange(p2,p1, ncol = 1)
boxCoxEnc <- boxCoxEncode(numericEncodings,numerVars,minNormalize = list(LotFrontage = 1, LotArea = 0.05, GarageCars = 0.05, BsmtFinSF2 = 0.05))
lapply(numericEncodings[,numerVars,with=FALSE],min,na.rm=TRUE)
# DUMMY ENCODING
dummyEnc1 <- dummyEncode(catEncoding,catVars,"newLevel",".","na",TRUE,50)
dummyEnc2 <- dummyEncode(catEncoding,catVars,"ghost",".","na",TRUE,50)
dummyEnc3 <- dummyEncode(catEncoding,catVars,"ghost",".","na",FALSE,50)
dummyEnc4 <- dummyEncode(catEncoding,catVars,"newLevel",".","na",TRUE,50)
dummyEnc5 <- dummyEncode(catEncoding,catVars,"newLevel",".","na",TRUE,50)
dummyTdt1 <- applyEncoding(catEncoding,dummyEnc1)
dummyTdt2 <- applyEncoding(catEncoding,dummyEnc2)
dummyTdt3 <- applyEncoding(catEncoding,dummyEnc3)
dummyTdt4 <- applyEncoding(catEncoding,dummyEnc4)
dummyTdt5 <- applyEncoding(catEncoding,dummyEnc5)
if(sum(dummyTdt2[is.na(catEncoding$FireplaceQu),grep("FireplaceQu",dummyEnc2$newNames,value=T),with=FALSE])>0) stop("treatNA = 'ghost' failed on fullRank = T")
if(sum(dummyTdt3[is.na(catEncoding$FireplaceQu),grep("FireplaceQu",dummyEnc3$newNames,value=T),with=FALSE])>0) stop("treatNA = 'ghost' failed on fullRank = F.")
# View effect on variable with NAs
dummyTdt1[,grep("FireplaceQu",dummyEnc1$newNames,value=T),with=FALSE]
dummyTdt2[,grep("FireplaceQu",dummyEnc2$newNames,value=T),with=FALSE]
dummyTdt3[,grep("FireplaceQu",dummyEnc3$newNames,value=T),with=FALSE]
dummyTdt4[,grep("FireplaceQu",dummyEnc4$newNames,value=T),with=FALSE]
dummyTdt5[,grep("FireplaceQu",dummyEnc5$newNames,value=T),with=FALSE]
# Create DT with new Levels
catEncWithNewLevels <- rbindlist(list(catEncoding,list("New","TA",10,"Pave","New",0)))
# View Effect that a new level has on encoding:
dummyTdt1 <- applyEncoding(catEncWithNewLevels,dummyEnc1)
catEncoding2 <- catEncoding[sample(1:nrow(catEncoding), size = 1000000, replace = TRUE)]
system.time(
{
dummyEnc1 <- dummyEncode(catEncoding2, catVars)
dummyDT1 <- applyEncoding(catEncoding2, dummyEnc1)
}
)
require(caret) # Caret package must be loaded to use dummyVars
system.time(
{
caretDummy <- dummyVars(~.,catEncoding2[,catVars,with=FALSE])
caretDumDT <- predict(caretDummy,catEncoding2)
caretDumDT <- as.data.table(caretDumDT)
}
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.