# R/cw_funcs.R
#
# Defines functions: cw_par, cw_xgb_par

# Crossvalidate parameters of different models
# Cross-validate a single model parameter over a grid of candidate values.
#
# For each value in `newpars`, the parameter located at the three-level
# nested position `parpath` inside `models_0` is overwritten, the model
# stack is refit via doIt() on a fixed random half of TD, and the resulting
# percentage improvement is printed.
#
# Args:
#   TD:        training data (indexed row-wise as TD[iin] — presumably a
#              data.table; verify against callers).
#   sX, sY:    feature / target column names, forwarded to doIt().
#   parpath:   character vector of length 3 addressing the parameter inside
#              models_0, e.g. c("xgbreg", "mae", "eta").
#   newpars:   vector of candidate values to try for that parameter.
#   models_0:  base-model specification list (modified locally per value).
#   models_1:  second-stage model specification, forwarded to doIt().
#   seed1:     seed controlling the half-sample split of TD.
#   seed2:     seed forwarded to doIt() for fitting.
#
# Returns: NULL invisibly; results are reported via print() side effects.
cw_par <- function(TD, sX, sY, parpath, newpars, models_0, models_1, seed1 = 2, seed2 = 12) {

  set.seed(seed1)
  # seq_len() is safe when nrow(TD) == 0, unlike 1:nrow(TD) which would
  # yield c(1, 0).
  iin <- sample(seq_len(nrow(TD)), floor(nrow(TD) * 0.5))

  for (newpar in newpars) {

    # Overwrite the parameter at the nested position given by parpath.
    models_0[[parpath[1]]][[parpath[2]]][[parpath[3]]] <- newpar

    l_cw <- doIt(TD[iin], sX = sX, sY = sY,
                 models0 = models_0, models1 = models_1, seed = seed2,
                 incl_outsmp = TRUE)

    print(paste0(paste(parpath, collapse = " --> "),
                 " = ", newpar, " ::: percentage improvement: ",
                 round(l_cw$basemodels$opt.prc, 5)))
  }

  invisible(NULL)
}


# Cross-validate the inclusion (and possible exclusion) of certain parameters.
# Inclusion happens through make_feat_func, which must return the names of the new features.
# Cross-validate xgboost feature selection by importance-gain thresholds and
# by dropping top features one at a time.
#
# Workflow:
#   1. Optionally extend sX with features created by make_feat_func(TD).
#   2. Fit on the out-half (-iin) with init_nrounds rounds to obtain an
#      xgboost importance table.
#   3. For each threshold in min_gains, refit on the in-half using only
#      features whose Gain exceeds the threshold (NA = keep all), and record
#      the score.
#   4. If n_best is given, try dropping each of the n_best most important
#      features one at a time; when reintroduce is FALSE, a baseline run
#      (i == 0) sets the reference score and improving drops are accumulated
#      in drop_featZ.
#
# Args:
#   TD:             training data (indexed row-wise as TD[iin] / TD[-iin] —
#                   presumably a data.table; verify against callers).
#   sX, sY:         feature / target column names, forwarded to doIt().
#   models_0:       base-model spec; models_0$xgbreg$mae$nrounds is set
#                   temporarily for the importance fit and reset afterwards.
#   models_1:       second-stage model spec, forwarded to doIt().
#   seed1, seed2:   seeds for the half-sample split and the doIt() fits.
#   min_gains:      numeric vector of Gain thresholds to evaluate; NA means
#                   "keep all features".
#   n_best:         number of top features to try dropping (NULL = skip).
#   reintroduce:    when TRUE, skip the baseline run and the best-score
#                   bookkeeping; only per-feature scores are reported.
#   init_nrounds:   nrounds used for the importance fit.
#   make_feat_func: optional function(TD) creating new features in TD and
#                   returning their names.
#
# Returns: a list with element `table` (one row per evaluated configuration,
# columns `drop` and `scr`) and, when reintroduce is FALSE, element
# `drop_featZ` with the names of features whose removal improved the score.
cw_xgb_par <- function(TD, sX, sY, models_0, models_1, seed1 = 2, seed2 = 12,
                       min_gains = NULL, n_best = NULL, reintroduce = TRUE,
                       init_nrounds = 1000, make_feat_func = NULL) {

  if (!is.null(make_feat_func)) {
    nf <- make_feat_func(TD)
    sX <- c(sX, nf)
  }

  set.seed(seed1)
  # seq_len() is safe when nrow(TD) == 0, unlike 1:nrow(TD).
  iin <- sample(seq_len(nrow(TD)), floor(nrow(TD) * 0.5))

  # Use -iin (the out-half) to get information regarding variable importance.
  models_0$xgbreg$mae$nrounds <- init_nrounds
  fit <- doIt(TD[-iin], sX = sX, sY = sY,
              models0 = models_0, models1 = models_1, insmp = 1)

  dtimp <- xgb.importance(feature_names = sX, model = fit[[1]][[1]][[1]])
  print(dtimp[1:min(.N, n_best, 50)])

  # Restore the spec so the cross-validation fits use the default nrounds.
  models_0$xgbreg$mae$nrounds <- NULL
  out <- list()

  # Skip features with gain beneath each threshold in min_gains.
  for (x in min_gains) {

    if (is.na(x)) {
      sX_ <- sX
    } else {
      sX_ <- dtimp[Gain > x, Feature]
    }

    l_cw <- doIt(TD[iin], sX = sX_, sY = sY,
                 models0 = models_0, models1 = models_1, seed = seed2,
                 incl_outsmp = TRUE)

    print(paste0("Minimum gain = ", x, " ::: percentage improvement: ",
                 round(l_cw$basemodels$opt.prc, 5)))

    out[[length(out) + 1]] <- list(drop = x, scr = round(l_cw$basemodels$opt.prc, 5))
  }

  # Initialize here so the return value below is well-defined even when the
  # n_best loop never assigns it (e.g. n_best = NULL).
  drop_featZ <- NULL

  # Try dropping the "n_best" top features one at a time; i == 0 is the
  # baseline run with the full feature set.
  if (!is.null(n_best)) for (i in 0:n_best) {
    if (i == 0) {
      # In reintroduce mode there is no baseline comparison.
      if (reintroduce) { next }
      drop_feat <- NA
      sX_ <- sX
    } else {
      drop_feat <- dtimp[i, Feature]
      sX_ <- sX[-which(sX == drop_feat)]
    }

    cat("##############  Try dropping", drop_feat, "#####################\n")

    l_cw <- doIt(TD[iin], sX = sX_, sY = sY,
                 models0 = models_0, models1 = models_1, seed = seed2,
                 incl_outsmp = TRUE)

    cat("Dropping ", drop_feat, " ::: percentage improvement: ",
                 round(l_cw$basemodels$opt.prc, 5))

    out[[length(out) + 1]] <- list(drop = drop_feat, scr = round(l_cw$basemodels$opt.prc, 5))

    if (i == 0) {
      best_scr <- l_cw$basemodels$opt.prc
      cat(" ---> base score set!")
    } else {
      # Without a baseline there is nothing to compare against.
      if (reintroduce) { next }
      ddiff <- l_cw$basemodels$opt.prc - best_scr
      if (ddiff > 0) {
        cat(" ---> improvement from best score:", ddiff)
        # Keep the reduced feature set and remember the dropped feature.
        sX <- sX_
        drop_featZ <- c(drop_featZ, drop_feat)
        best_scr <- l_cw$basemodels$opt.prc
      }
    }
    cat("\n#######################################################################\n")
  }

  # NOTE(review): the original `list(table = ..., if(!reintroduce)drop_featZ = drop_featZ)`
  # was a parse error — a named list element cannot be produced by an `if`
  # expression inside a call. Build the conditional element explicitly.
  res <- list(table = do.call(rbind, out))
  if (!reintroduce) {
    res$drop_featZ <- drop_featZ
  }
  res
}
# Source: steinarv/k1 documentation, built on Oct. 19, 2017, 4:41 a.m.