models/files/glm_h2o.R

glm_h2o <- list(library = "h2o",
                type = c("Regression", "Classification"),
                parameters = data.frame(parameter = c('alpha', 'lambda'),
                                        class = c("numeric", "numeric"),
                                        label = c('Mixing Percentage', 'Regularization Parameter')),
                grid = function(x, y, len = NULL, search = "grid") {
                  if(search == "grid") {
                    out <- expand.grid(alpha = seq(0, 1, length = len),
                                       lambda = c(0, 10 ^ seq(-1, -4, length = len - 1)))
                  } else {
                    out <- data.frame(alpha = runif(len, min = 0, 1),
                                      lambda = 2^runif(len, min = -10, 3))
                  }
                  out
                },
                loop = NULL,
                fit = function(x, y, wts, param, lev, last, classProbs, ...) {
                  dat <- if(!is.data.frame(x)) as.data.frame(x, stringsAsFactors = TRUE) else x
                  dat$.outcome <- y
                  p <- ncol(dat)
                  frame_name <- paste0("tmp_glm_h2o_dat_",sample.int(100000, 1))
                  tmp_train_dat = h2o::as.h2o(dat, destination_frame = frame_name)
                  
                  # Assign pre specified parameters
                  args <- c(list(
                    x = colnames(x), y = ".outcome",
                    training_frame = tmp_train_dat,
                    family = if(is.factor(y)) "binomial" else "gaussian"
                  ),
                  
                  )
                  
                  search_param <- names(param)[!names(param) %in% names(args)]
                  if(length(search_param) > 0){
                    for(p in search_param){
                      args[[p]] <- param[p]
                    }
                  }
                  out <- do.call(h2o::h2o.glm, args)
                  h2o::h2o.getModel(out@model_id)
                },
                predict = function(modelFit, newdata, submodels = NULL) {
                  frame_name <- paste0("new_glm_h2o_dat_",sample.int(100000, 1))
                  newdata <- h2o::as.h2o(newdata, destination_frame = frame_name)
                  as.data.frame(predict(modelFit, newdata), stringsAsFactors = TRUE)[,1]
                },
                prob = function(modelFit, newdata, submodels = NULL) {
                  frame_name <- paste0("new_glm_h2o_dat_",sample.int(100000, 1))
                  newdata <- h2o::as.h2o(newdata, destination_frame = frame_name)
                  as.data.frame(predict(modelFit, newdata), stringsAsFactors = TRUE)[,-1]
                },
                predictors = function(object, ...) {
                  out <- as.data.frame(h2o::h2o.varimp(object), stringsAsFactors = TRUE)
                  out$names
                },
                varImp = function(object, ...) {
                  out <- as.data.frame(h2o::h2o.varimp(object), stringsAsFactors = TRUE)
                  rownames(out) <- out$names
                  all_var <- object@allparameters$x
                  if(any(!(all_var %in% out$variable))) {
                    missing <- all_var[!(all_var %in% out$variable)]
                    tmp <- data.frame(relative_importance = rep(0, length(missing)),
                                      scaled_importance = rep(0, length(missing)))
                    rownames(tmp) <- missing
                    out <- rbind(out, tmp)
                  }
                  
                  out
                },
                levels = NULL,
                tags = c("Generalized Linear Model", "Implicit Feature Selection", 
                         "L1 Regularization", "L2 Regularization", "Linear Classifier",
                         "Linear Regression", "Two Class Only"),
                sort = function(x) x[order(-x$lambda, x$alpha),],
                trim = NULL)
lijingya/MLTools documentation built on Dec. 5, 2024, 6:36 a.m.