runs/crossvalidation.R

options(scipen = 10, digits = 5)
dtrain <- fread("zcat ./input/train.gz")

# define_cat_feats(dtrain)

ord_feats <- names(dtrain)[ !(names(dtrain) %in% pkg_env$cat_vars) ][ -(1:3) ]

# simple features
prepeare_features(dtrain)
prepeare_special_features(dtrain)

# ................advanced features.................................
# dtrain[, IDW := invDistWeight(longY = longitude / 1e6 * pi / 180,
#                               latY = latitude / 1e6 * pi / 180,
#                               longX = longitude / 1e6 * pi / 180,
#                               latX = latitude / 1e6 * pi / 180,
#                               X = logerror)]
# saveRDS(dtrain$IDW, "./input/IDW.rds")
dtrain  <- dtrain[, IDW := readRDS("./input/IDW.rds")]
dtrain[, cor(logerror, IDW)]

names(dtrain)


n <- nrow(dtrain)

iin <- sample(1:n, floor(2 * n / 3))
# iin <- ind_train[[1]]

feats <- names(dtrain)[!( names(dtrain) %in% c("parcelid", "logerror", "transactiondate", pkg_env$cat_vars) )]
# models_0 = list(xgbreg      = list( mae = list(nrounds = 600, eval_metric = "mae"),
#                                     rmse = list(nrounds = 600, eval_metric = "rmse")),
#                 rlm             = TRUE,
#                 rf              = TRUE,
#                 lasso           = TRUE
#                 )
# models_1 = list(xgbreg_log      = list(nrounds = NULL))

########################################################################
models_0 = list(xgbregmae       = list(fitfunc = fitXGB,
                                       par0 = list(nrounds = 600,
                                                   eval_metric = "mae",
                                                   type = "XGBREG")
                                  ),
                rlm             = list(fitfunc = fitRLM,
                                       par0 = NULL),
#                rf              = list(fitfunc = fitRF,
#                                       par0 = NULL),
                lasso           = list(fitfunc = fitLASSO,
                                       par0 = NULL)
)
###
models_1 = list(xgblog       = list(fitfunc = fitXGB,
                                    par0 = list(nrounds = 600,
                                                 eval_metric = NULL,
                                                 type = "XGBLOG")
                                    )
)
###########################################################################

l_cw <- doIt(dtrain[iin], sX = feats, sY = "logerror",
             models0 = models_0, models1 = models_1) # Should be an easy way to set models and also assign temporary parameters

l_fit <- doIt(dtrain[iin], sX = feats, sY = "logerror", insmp = 1,
              models0 = models_0, models1 = models_1)

# In sample stats
l_cw

# Out of sample predictions
lpred0 <- lapply(l_fit$m0, function(l)l$predf(l, dtrain[-iin, feats, with = FALSE], base_pred = 0))

predStat(y = dtrain[-iin, logerror],
         lpred = lpred0,
         modelnames = names(l_cw$basemodels$opt.wgt),
         optw = list(par = l_cw$basemodels$opt.wgt))


# Modified out of sample predictions
# Combine the modify
pred1 <- l_fit$m1$xgblog$predf(l_fit$m1$xgblog, dtrain[-iin, feats, with = FALSE])
opt_pred0 <- apply2(lpred0, function(r)sum(r * l_cw$basemodels$opt.wgt))
opt_pred0 <- ifelse((pred1 > 0.5) == (opt_pred0 > 0), opt_pred0, 0)

predStat(y = dtrain[-iin, logerror],
         lpred = list(opt_pred0),
         modelnames = "optimal_combination",
        optw = list(par = 1)
        )

# Modify then combine
pred1 <- l_fit$m1$xgblog$predf(l_fit$m1$xgblog, dtrain[-iin, feats, with = FALSE])
lpred0 <- lapply(lpred0, function(v)l_fit$m1$xgblog$editFunc(v, pred1))

predStat(y = dtrain[-iin, logerror],
         lpred = lpred0,
         modelnames = names(l_cw$basemodels$opt.wgt),
         optw = list(par = l_cw$basemodels$opt.wgt)
         )
steinarv/k1 documentation built on Oct. 19, 2017, 4:41 a.m.