Nothing
## ---- eval = F, echo = T------------------------------------------------------
#
# library(RGF)
#
# # conversion from a matrix object to a scipy sparse matrix
# #----------------------------------------------------------
#
# set.seed(1)
#
# x = matrix(runif(1000), nrow = 100, ncol = 10)
#
# x_sparse = mat_2scipy_sparse(x, format = "sparse_row_matrix")
#
# print(dim(x))
#
# [1] 100 10
#
# print(x_sparse$shape)
#
# (100, 10)
#
## ---- eval = F, echo = T------------------------------------------------------
#
# # conversion from a dgCMatrix object to a scipy sparse matrix
# #-------------------------------------------------------------
#
# data = c(1, 0, 2, 0, 0, 3, 4, 5, 6)
#
#
# # 'dgCMatrix' sparse matrix
# #--------------------------
#
# dgcM = Matrix::Matrix(data = data, nrow = 3,
#
# ncol = 3, byrow = TRUE,
#
# sparse = TRUE)
#
# print(dim(dgcM))
#
# [1] 3 3
#
# x_sparse = TO_scipy_sparse(dgcM)
#
# print(x_sparse$shape)
#
# (3, 3)
#
#
# # 'dgRMatrix' sparse matrix
# #--------------------------
#
# dgrM = as(dgcM, "RsparseMatrix")
#
# class(dgrM)
#
# # [1] "dgRMatrix"
# # attr(,"package")
# # [1] "Matrix"
#
# print(dim(dgrM))
#
# [1] 3 3
#
# res_dgr = TO_scipy_sparse(dgrM)
#
# print(res_dgr$shape)
#
# (3, 3)
#
## ---- eval = F, echo = T------------------------------------------------------
#
# data(Boston, package = 'KernelKnn')
#
# library(RGF)
# library(ranger)
# library(xgboost)
#
#
#
# # shuffling function for cross-validation folds
# #-----------------------------------------------
#
#
# func_shuffle = function(vec, times = 10) {
#
# for (i in 1:times) {
#
# out = sample(vec, length(vec))
# }
# out
# }
#
#
# # cross-validation folds [regression]
# #-------------------------------------
#
#
# regr_folds = function(folds, RESP, stratified = FALSE) {
#
# if (is.factor(RESP)) {
#
# stop(simpleError("This function is meant for regression.
# For classification use the 'class_folds' function."))
# }
#
# samp_vec = rep(1/folds, folds)
#
# sort_names = paste0('fold_', 1:folds)
#
#
# if (stratified == TRUE) {
#
# stratif = cut(RESP, breaks = folds)
#
# clas = lapply(unique(stratif), function(x) which(stratif == x))
#
# len = lapply(clas, function(x) length(x))
#
# prop = lapply(len, function(y) sapply(1:length(samp_vec), function(x)
# round(y * samp_vec[x])))
#
# repl = unlist(lapply(prop, function(x) sapply(1:length(x), function(y)
# rep(paste0('fold_', y), x[y]))))
#
# spl = suppressWarnings(split(1:length(RESP), repl))}
#
# else {
#
# prop = lapply(length(RESP), function(y) sapply(1:length(samp_vec),
# function(x) round(y * samp_vec[x])))
#
# repl = func_shuffle(unlist(lapply(prop, function(x)
# sapply(1:length(x), function(y) rep(paste0('fold_', y), x[y])))))
#
# spl = suppressWarnings(split(1:length(RESP), repl))
# }
#
# spl = spl[sort_names]
#
# if (length(table(unlist(lapply(spl, function(x) length(x))))) > 1) {
#
# warning('the folds are not equally split')
# }
#
# if (length(unlist(spl)) != length(RESP)) {
#
# stop(simpleError("the length of the splits is not equal to the length
# of the response"))
# }
#
# spl
# }
## ---- eval = F, echo = T------------------------------------------------------
#
# NUM_FOLDS = 5
#
# set.seed(1)
# FOLDS = regr_folds(folds = NUM_FOLDS, Boston[, 'medv'], stratified = T)
#
#
# boston_rgf_te = boston_ranger_te = boston_xgb_te = boston_rgf_time =
# boston_ranger_time = boston_xgb_time = rep(NA, NUM_FOLDS)
#
#
# for (i in 1:length(FOLDS)) {
#
# cat("fold : ", i, "\n")
#
# samp = unlist(FOLDS[-i])
# samp_ = unlist(FOLDS[i])
#
#
# # RGF
# #----
#
# rgf_start = Sys.time()
#
# init_regr = RGF_Regressor$new(l2 = 0.1)
#
# init_regr$fit(x = as.matrix(Boston[samp, -ncol(Boston)]), y = Boston[samp, ncol(Boston)])
#
# pr_te = init_regr$predict(as.matrix(Boston[samp_, -ncol(Boston)]))
#
# rgf_end = Sys.time()
#
# boston_rgf_time[i] = rgf_end - rgf_start
#
# boston_rgf_te[i] = MLmetrics::RMSE(Boston[samp_, 'medv'], pr_te)
#
#
# # ranger
# #-------
#
# ranger_start = Sys.time()
#
# fit = ranger(dependent.variable.name = "medv", data = Boston[samp, ], write.forest = TRUE,
#
# probability = F, num.threads = 1, num.trees = 500, verbose = T,
#
# classification = F, mtry = NULL, min.node.size = 5, keep.inbag = T)
#
# pred_te = predict(fit, data = Boston[samp_, -ncol(Boston)], type = 'se')$predictions
#
# ranger_end = Sys.time()
#
# boston_ranger_time[i] = ranger_end - ranger_start
#
# boston_ranger_te[i] = MLmetrics::RMSE(Boston[samp_, 'medv'], pred_te)
#
#
# # xgboost
# #--------
#
# xgb_start = Sys.time()
#
# dtrain <- xgb.DMatrix(data = as.matrix(Boston[samp, -ncol(Boston)]),
#
# label = Boston[samp, ncol(Boston)])
#
# dtest <- xgb.DMatrix(data = as.matrix(Boston[samp_, -ncol(Boston)]),
#
# label = Boston[samp_, ncol(Boston)])
#
#
# watchlist <- list(train = dtrain, test = dtest)
#
#
# param = list("objective" = "reg:linear", "bst:eta" = 0.05, "max_depth" = 4,
#
# "subsample" = 0.85, "colsample_bytree" = 0.85, "booster" = "gbtree",
#
# "nthread" = 1)
#
# fit = xgb.train(param, dtrain, nround = 500, print_every_n = 100, watchlist = watchlist,
#
# early_stopping_rounds = 20, maximize = FALSE, verbose = 0)
#
# p_te = xgboost:::predict.xgb.Booster(fit, as.matrix(Boston[samp_, -ncol(Boston)]),
#
# ntreelimit = fit$best_iteration)
#
# xgb_end = Sys.time()
#
# boston_xgb_time[i] = xgb_end - xgb_start
#
# boston_xgb_te[i] = MLmetrics::RMSE(Boston[samp_, 'medv'], p_te)
# }
#
#
# fold : 1
# fold : 2
# fold : 3
# fold : 4
# fold : 5
#
#
#
# cat("total time rgf 5 fold cross-validation : ", sum(boston_rgf_time),
# " mean rmse on test data : ", mean(boston_rgf_te), "\n")
#
# cat("total time ranger 5 fold cross-validation : ", sum(boston_ranger_time),
# " mean rmse on test data : ", mean(boston_ranger_te), "\n")
#
# cat("total time xgb 5 fold cross-validation : ", sum(boston_xgb_time),
# " mean rmse on test data : ", mean(boston_xgb_te), "\n")
#
## ---- eval = F, echo = T------------------------------------------------------
#
# total time rgf 5 fold cross-validation : 0.7730639 mean rmse on test data : 3.832135
# total time ranger 5 fold cross-validation : 3.826846 mean rmse on test data : 4.17419
# total time xgb 5 fold cross-validation : 0.4316094 mean rmse on test data : 3.949122
#
## ---- eval = F, echo = T------------------------------------------------------
#
# # download the data from the following GitHub repository (tested on a Linux OS)
#
# system("wget
# https://raw.githubusercontent.com/mlampros/DataSets/master/africa_soil_train_data.zip")
#
#
# # load the data in the R session
#
# train_dat = read.table(unz("africa_soil_train_data.zip", "train.csv"), nrows = 1157,
#
# header = T, quote = "\"", sep = ",")
#
#
# # c("Ca", "P", "pH", "SOC", "Sand") : response variables
#
#
# # exclude response-variables and factor variable
#
# x = train_dat[, -c(1, which(colnames(train_dat) %in%
# c("Ca", "P", "pH", "SOC", "Sand", "Depth")))]
#
#
# # take (randomly) the first of the responses for train
#
# y = train_dat[, "Ca"]
#
#
# # dataset for ranger
#
# tmp_rg_dat = cbind(Ca = y, x)
#
#
# # cross-validation folds
#
# set.seed(2)
# FOLDS = regr_folds(folds = NUM_FOLDS, y, stratified = T)
#
#
# highdim_rgf_te = highdim_ranger_te = highdim_xgb_te = highdim_rgf_time =
# highdim_ranger_time = highdim_xgb_time = rep(NA, NUM_FOLDS)
#
#
# for (i in 1:length(FOLDS)) {
#
# cat("fold : ", i, "\n")
#
# new_samp = unlist(FOLDS[-i])
# new_samp_ = unlist(FOLDS[i])
#
#
# # RGF
# #----
#
# rgf_start = Sys.time()
#
# init_regr = FastRGF_Regressor$new(n_jobs = 5, l2 = 0.1) # added 'l2' regularization
#
# init_regr$fit(x = as.matrix(x[new_samp, ]), y = y[new_samp])
#
# pr_te = init_regr$predict(as.matrix(x[new_samp_, ]))
#
# rgf_end = Sys.time()
#
# highdim_rgf_time[i] = rgf_end - rgf_start
#
# highdim_rgf_te[i] = MLmetrics::RMSE(y[new_samp_], pr_te)
#
#
# # ranger
# #-------
#
# ranger_start = Sys.time()
#
#
# fit = ranger(dependent.variable.name = "Ca", data = tmp_rg_dat[new_samp, ],
#
# write.forest = TRUE, probability = F, num.threads = 5, num.trees = 500,
#
# verbose = T, classification = F, mtry = NULL, min.node.size = 5,
#
# keep.inbag = T)
#
#
# pred_te = predict(fit, data = x[new_samp_, ], type = 'se')$predictions
#
# ranger_end = Sys.time()
#
# highdim_ranger_time[i] = ranger_end - ranger_start
#
# highdim_ranger_te[i] = MLmetrics::RMSE(y[new_samp_], pred_te)
#
#
# # xgboost
# #--------
#
# xgb_start = Sys.time()
#
# dtrain <- xgb.DMatrix(data = as.matrix(x[new_samp, ]), label = y[new_samp])
#
# dtest <- xgb.DMatrix(data = as.matrix(x[new_samp_, ]), label = y[new_samp_])
#
# watchlist <- list(train = dtrain, test = dtest)
#
# param = list("objective" = "reg:linear", "bst:eta" = 0.05, "max_depth" = 6,
#
# "subsample" = 0.85, "colsample_bytree" = 0.85, "booster" = "gbtree",
#
# "nthread" = 5) # "lambda" = 0.1 does not improve RMSE
#
# fit = xgb.train(param, dtrain, nround = 500, print_every_n = 100, watchlist = watchlist,
#
# early_stopping_rounds = 20, maximize = FALSE, verbose = 0)
#
# p_te = xgboost:::predict.xgb.Booster(fit, as.matrix(x[new_samp_, ]),
# ntreelimit = fit$best_iteration)
#
# xgb_end = Sys.time()
#
# highdim_xgb_time[i] = xgb_end - xgb_start
#
# highdim_xgb_te[i] = MLmetrics::RMSE(y[new_samp_], p_te)
# }
#
#
# fold : 1
# fold : 2
# fold : 3
# fold : 4
# fold : 5
#
#
# cat("total time rgf 5 fold cross-validation : ", sum(highdim_rgf_time),
# " mean rmse on test data : ", mean(highdim_rgf_te), "\n")
#
# cat("total time ranger 5 fold cross-validation : ", sum(highdim_ranger_time),
# " mean rmse on test data : ", mean(highdim_ranger_te), "\n")
#
# cat("total time xgb 5 fold cross-validation : ", sum(highdim_xgb_time),
# " mean rmse on test data : ", mean(highdim_xgb_te), "\n")
#
## ---- eval = F, echo = T------------------------------------------------------
#
# total time rgf 5 fold cross-validation : 92.31971 mean rmse on test data : 0.5155166
# total time ranger 5 fold cross-validation : 27.32866 mean rmse on test data : 0.5394164
# total time xgb 5 fold cross-validation : 30.48834 mean rmse on test data : 0.5453544
#
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.