# runs/CW.R

# New variable: Distance from most populated area
#               Distance from nearest area with density above x

# Console number formatting for the whole session: 5 significant digits and a
# penalty of 10 against switching to scientific notation.
options(digits = 5, scipen = 10)

# Load the training set; the string is a shell command, so the gzipped file is
# decompressed by `zcat` and its output parsed by fread().
dtrain <- fread("zcat ./input/train.gz")

# NOTE(review): presumably this is what populates pkg_env$cat_vars (used on the
# next statement) -- confirm against define_cat_feats() in the package.
define_cat_feats(dtrain, saveEnv = FALSE, minInst = 500)

# Names of the columns that are not listed in pkg_env$cat_vars, with the first
# three of those remaining names dropped.
ord_feats <- names(dtrain)[!names(dtrain) %in% pkg_env$cat_vars][-seq_len(3)]

# Simple features.
# NOTE(review): both helpers are called only for their effect on `dtrain`
# (nothing is assigned), so they presumably add columns by reference -- confirm.
prepeare_features(dtrain)
prepeare_special_features(dtrain)

# ---------------- advanced features ----------------
# The IDW column was computed once from longitude/latitude (scaled by 1e6 and
# converted to radians) and logerror with the call below, then cached to disk:
#
# dtrain[, IDW := invDistWeight(longY = longitude / 1e6 * pi / 180,
#                               latY = latitude / 1e6 * pi / 180,
#                               longX = longitude / 1e6 * pi / 180,
#                               latX = latitude / 1e6 * pi / 180,
#                               X = logerror)]
# saveRDS(dtrain$IDW, "./input/IDW.rds")

# Attach the cached values; `:=` adds the column to `dtrain` in place.
dtrain[, IDW := readRDS("./input/IDW.rds")]

# Sanity check: correlation between the target and the IDW feature.
cor(dtrain[["logerror"]], dtrain[["IDW"]])

# Show the columns available at this point.
names(dtrain)

# Number of training rows; used further down to draw the subsample.
n <- nrow(dtrain)

# Candidate predictors: every column except the id, the target, the
# transaction date and the columns listed in pkg_env$cat_vars.
feats <- names(dtrain)[
  !names(dtrain) %in% c("parcelid", "logerror", "transactiondate",
                        pkg_env$cat_vars)
]
###########################
# models_0 = list(
#                   rf             = list( fitfunc = fitRF,
#                                          par0 = NULL),
#                   xgbrmse         = list( fitfunc = fitXGB,
#                                               par0 = list(eval_metric = "rmse",
#                                                           type = "XGBREG",
#                                                           colsample_bytree = 0.75,
#                                                           eta = 0.003,
#                                                           min_child_weight = 50)
#                   ),
#                   s21              = list( fitfunc = fitTwoStage,
#                                           par0 = list(stage1 = list(rlm      = list(fitfunc = fitRLM,
#                                                                                    par0 = NULL),
#                                                                     lasso    = list(fitfunc = fitLASSO,
#                                                                                    par0 = NULL)
#                                                                     ),
#                                                      stage2 = list(fitfunc = fitXGB,
#                                                                    par0 = list(nrounds = NULL,
#                                                                                eval_metric = "mae",
#                                                                                type = "XGBREG",
#                                                                                subsample = .5,
#                                                                                eta = 0.007,
#                                                                                min_child_weight = 50)
#                                                                    )
#                                                      )
#                                          ),
#                   s22              = list( fitfunc = fitTwoStage,
#                                           par0 = list(stage1 = list(xgbabs   = list( fitfunc = fitXGB,
#                                                                                      par0 = list(eval_metric = "mae",
#                                                                                                  type = "XGBREG",
#                                                                                                   trans = list(type = "abs"),
#                                                                                                   subsample = .5,
#                                                                                                   eta = 0.003,
#                                                                                                   min_child_weight = 50)
#                                                                     ),
#                                                                     lasso    = list(fitfunc = fitLASSO,
#                                                                                     par0 = NULL)
#                                           ),
#                                           stage2 = list(fitfunc = fitXGB,
#                                                         par0 = list(nrounds = NULL,
#                                                                     eval_metric = "mae",
#                                                                     type = "XGBREG",
#                                                                     subsample = .5,
#                                                                     eta = 0.004,
#                                                                     trans = list(type = "div", var = "xgbabs"),
#                                                                     min_child_weight = 50)
#                                           )
#                                           )
#                   ),
#                   s23              = list( fitfunc = fitTwoStage,
#                                           par0 = list(stage1 = list(rlm      = list(fitfunc = fitRLM,
#                                                                                     par0 = NULL),
#                                                                     lasso    = list(fitfunc = fitLASSO,
#                                                                                     par0 = NULL)
#                                           ),
#                                           stage2 = list(fitfunc = fitXGB,
#                                                         par0 = list(nrounds = NULL,
#                                                                     eval_metric = "mae",
#                                                                     type = "XGBREG",
#                                                                     subsample = .5,
#                                                                     eta = 0.007,
#                                                                     trans = list(type = "diff", var = "lasso"),
#                                                                     min_child_weight = 50)
#                                           )
#                                           )
#                   ),
#                   xgbregmae       = list( fitfunc = fitXGB,
#                                           par0 = list(eval_metric = "mae",
#                                                       type = "XGBREG",
#                                                       subsample = .5,
#                                                       eta = 0.007,
#                                                       min_child_weight = 50)
#                   ),
#                   lasso           = list( fitfunc = fitLASSO,
#                                            par0 = NULL)
# )
# models_1 = list(xgblog   = list(fitfunc = fitXGB,
#                                 par0 = list(
#                                   type = "XGBLOG"
#                                 ))
#                 )


# ---- derived-feature specifications ---------------------------------------
# Each spec is a list(feat, func, v1, v2): `feat` is the name of the new
# feature, `func` a two-argument function, and `v1`/`v2` the names of its
# input columns.
# NOTE(review): how the fit functions consume `mF` is not visible in this
# file; presumably they evaluate func(v1, v2) and store it as `feat` -- confirm.

# IDW1..IDW12: product of IDW with each of the columns m1..m12.
new_lasso_feats <- lapply(seq_len(12), function(i) {
  list(feat = paste0("IDW", i), func = `*`, v1 = "IDW", v2 = paste0("m", i))
})

# Four size/value columns and their square-root transforms (<name>_SQRT).
f4 <- c(
  "calculatedfinishedsquarefeet",
  "landtaxvaluedollarcnt",
  "taxvaluedollarcnt_DivBy_taxamount",
  "structuretaxvaluedollarcnt_DivBy_calculatedfinishedsquarefeet"
)
nf4 <- lapply(f4, function(s) {
  list(feat = paste0(s, "_SQRT"), func = function(a, b) a^(0.5),
       v1 = s, v2 = NULL)
})

# ---- lasso base models -----------------------------------------------------
# Defined once and reused below, both as stand-alone entries of `models_0` and
# as stage 1 of the two-stage model `obj`, so the two copies cannot drift apart.
lasso_models <- list(
  # IDW x m1..m12 interactions only.
  lassoo1 = list(
    fitfunc = fitLASSO,
    par0 = list(
      mF = new_lasso_feats,
      # vapply() guarantees a character vector whatever the input looks like.
      feats = vapply(new_lasso_feats, function(l) l$feat, character(1))
    )
  ),
  # m1..m12 plus IDW and its absolute value.
  lassoo2 = list(
    fitfunc = fitLASSO,
    par0 = list(
      mF = list(list(feat = "absIDW", func = function(a, b) abs(a),
                     v1 = "IDW", v2 = NULL)),
      feats = c(paste0("m", 1:12), "IDW", "absIDW")
    )
  ),
  # Every column whose name contains "_IS_".
  lassoo3 = list(
    fitfunc = fitLASSO,
    par0 = list(feats = names(dtrain)[grepl("_IS_", names(dtrain))])
  ),
  # NOTE(review): `mF` builds the *_SQRT features, but `feats` lists only the
  # untransformed columns, so the square roots may never reach the model --
  # confirm whether `feats` should also include paste0(f4, "_SQRT").
  lassoo4 = list(
    fitfunc = fitLASSO,
    par0 = list(mF = nf4, feats = f4)
  ),
  # No parameters: whatever fitLASSO does by default.
  lassoo5 = list(
    fitfunc = fitLASSO,
    par0 = NULL
  )
)

# ---- base-model specification passed to doIt() -----------------------------
# The five lassos on their own, plus `obj`: a two-stage model with the same
# five lassos as stage 1 and an XGBoost regressor as stage 2.
models_0 <- c(
  lasso_models,
  list(
    obj = list(
      fitfunc = fitTwoStage,
      par0 = list(
        stage1 = lasso_models,
        stage2 = list(
          fitfunc = fitXGB,
          par0 = list(
            nrounds = NULL,
            eval_metric = "mae",
            type = "XGBREG",
            subsample = .75,
            eta = 0.01,
            min_child_weight = 5,
            colsample_bytree = 1,
            # Stage-2 inputs are named after the stage-1 models, plus the
            # coordinates; taking the names from `lasso_models` keeps the two
            # lists in sync.
            feats = c(names(lasso_models), "longitude", "latitude")
          )
        )
      )
    )
  )
)

# Second model list handed to doIt() as `models1`; none is supplied in this run.
models_1 <- NULL

# Parameter overrides for the `xgbregmae` base model.
# They are applied only when that model is actually part of `models_0`:
# assigning through a missing list element creates it, which would otherwise
# add an `xgbregmae` entry holding a `par0` but no `fitfunc`. The active
# `models_0` in this file does not define `xgbregmae`; only the commented-out
# specification does.
if ("xgbregmae" %in% names(models_0)) {
  models_0$xgbregmae$par0$subsample <- 0.5
  models_0$xgbregmae$par0$eta <- 0.007
  models_0$xgbregmae$par0$min_child_weight <- 50
}

# Reproducible subsample of the training rows.
set.seed(2)

# Only the second draw (half of the rows) is used. The first one (two thirds
# of the rows) is overwritten immediately, but it still advances the random
# number generator, so it stays in place to keep the selected rows unchanged.
iin <- sample(seq_len(n), floor(2 * n / 3))
iin <- sample(seq_len(n), floor(n * 0.5))

# Fit the model specifications on the subsample, with `logerror` as target.
l_cw <- doIt(
  dtrain[iin],
  sX = feats,
  sY = "logerror",
  seed = 12,
  models0 = models_0,
  models1 = models_1
)

# Inspect two components of the result stored under `basemodels`.
l_cw$basemodels$ind.prc
l_cw$basemodels$mp.prc


# rlm        s2 xgbregmae     lasso
# 0.007132  0.014290  0.017073  0.011390
#
# > l_cw$basemodels$mp.prc
# [1] 0.016015
#####################################################################################################################
# 0.01672
# cw_par(TD = dtrain, sX = feats, sY = "logerror",
#        models_0 = models_0, models_1 = models_1,
#        parpath = c("xgbregmae", "par0","eta"), newpars = c(.1, .5))
# ############################################
# cw_xgb_par(TD = dtrain, min_gains <- c(NA, 1/10000, 1/1000, 1/100, 5/100), n_best = 10,
#                  sX = feats, sY = "logerror", models_0 = models_0, models_1 = models_1)

# f1 <- function(X) {
#   newfeatZ <- NULL
#
#   for(v in pkg_env$cat_vars){
#     newfeat <- paste0(v, "_n")
#     u <- unique(X[[v]])
#     for(u0 in u)X[v == u0,  (newfeat) := sum(get(v) == u0)]
#     newfeatZ <- c(newfeatZ, newfeat)
#   }
#
#   newfeatZ
# }
#
#
# cw_xgb_par(TD = dtrain, min_gains = NULL, n_best = 10,
#                sX = feats, sY = "logerror", models_0 = models_0, models_1 = models_1)
#
# cw_xgb_par(TD = dtrain, min_gains = NULL, n_best = 10,
#                sX = feats, sY = "logerror", models_0 = models_0, models_1 = models_1,
#                make_feat_func = f1, reintroduce = FALSE)
# steinarv/k1 documentation built on Oct. 19, 2017, 4:41 a.m.