This package provides functions for generating all possible splits of variables into groups, and computing the best split selection regression estimator for low-dimensional data.
You can install the stable version on R CRAN.
install.packages("splitSelect", dependencies = TRUE)
You can install the development version from GitHub.
library(devtools) devtools::install_github("AnthonyChristidis/splitSelect")
Here is some code to generate all possible splits of variables into groups.
# Loading library library(splitSelect) # Setting number of variables and groups p <- 8 G <- 4 use.all <- TRUE # Generate the number of partitions my.partitions <- generate_partitions(p, G, use.all=use.all) my.partitions # Generate the number of splits nsplit(p, G, use.all=use.all) # Generate the number of splits (fixed partition) nsplit(p, G, use.all=use.all, fix.partition=matrix(c(2,2,2,2), nrow=1)) # Generate the splits all.splits <- generate_splits(p, G, use.all=use.all) head(all.splits) nrow(all.splits) # Generate the splits (fixed partition) all.splits <- generate_splits(p, G, use.all=use.all, fix.partition=matrix(c(2,2,2,2), nrow=1)) head(all.splits) nrow(all.splits) # Generate samples of splits sample.splits <- rsplit(10000, p, G, fix.partition=matrix(c(2,2,2,2), nrow=1)) sample.splits
Here is some code to apply to compute the best split selection estimator for simulated data with spurious correlation in the training set.
# Download the packages install.packages("simTargetCov") install.packages("glmnet") install.packages("SplitReg") # Setting the parameters p <- 6 n <- 30 n.test <- 5000 group.beta <- 5 beta <- c(rep(1, 2), rep(group.beta, p-2)) rho <- 0.1 r <- 0.9 SNR <- 3 # Creating the target matrix with "kernel" set to rho target_cor <- function(r, p){ Gamma <- diag(p) for(i in 1:(p-1)){ for(j in (i+1):p){ Gamma[i,j] <- Gamma[j,i] <- r^(abs(i-j)) } } return(Gamma) } # AR Correlation Structure Sigma.r <- target_cor(r, p) Sigma.rho <- target_cor(rho, p) sigma.epsilon <- as.numeric(sqrt((t(beta) %*% Sigma.rho %*% beta)/SNR)) # Number of cores nb.cores <- parallel::detectCores()-1 # Registering the clusters cl <- parallel::makeCluster(nb.cores) doParallel::registerDoParallel(cl) # Set the seed set.seed(0) # Simulate some data x.train <- simTargetCov::simTargetCov(n=n, p=p, target=Sigma.r) y.train <- 1 + x.train %*% beta + rnorm(n=n, mean=0, sd=sigma.epsilon) x.test <- mvnfast::rmvn(n.test, mu=rep(0,p), sigma=Sigma.rho) y.test <- 1 + x.test %*% beta + rnorm(n.test, sd=sigma.epsilon) # Best Split Selection for Regression system.time( split.out <- cv.splitSelect(x.train, y.train, G=2, use.all=TRUE, fix.partition=list(matrix(c(2,4, 3,3), ncol=2, byrow=TRUE)), fix.split=NULL, intercept=TRUE, group.model="glmnet", alpha=0, nfolds=10, parallel=TRUE, cores=nb.cores) ) split.predictions <- predict(split.out, newx=x.test) mean((split.predictions-y.test)^2)/sigma.epsilon^2 # Ending the cluster parallel::stopCluster(cl) # Ridge Regression cv.ridge <- glmnet::cv.glmnet(x.train, y.train, alpha=0) ridge <- glmnet::glmnet(x.train, y.train, alpha=0, lambda=cv.ridge$lambda.min) ridge.predictions <- predict(ridge, newx=x.test) mean((ridge.predictions-y.test)^2)/sigma.epsilon^2 # Lasso cv.lasso <- glmnet::cv.glmnet(x.train, y.train, alpha=1) lasso <- glmnet::glmnet(x.train, y.train, alpha=1, lambda=cv.lasso$lambda.min) lasso.predictions <- predict(lasso, newx=x.test) mean((lasso.predictions-y.test)^2)/sigma.epsilon^2 # Elastic Net cv.elastic <- glmnet::cv.glmnet(x.train, y.train, alpha=3/4) elastic <- glmnet::glmnet(x.train, y.train, alpha=3/4, lambda=cv.elastic$lambda.min) elastic.predictions <- predict(elastic, newx=x.test) mean((elastic.predictions-y.test)^2)/sigma.epsilon^2 # SplitReg cv.splitreg <- SplitReg::cv.SplitReg(x.train, y.train, num_models=3, alpha=1e-2) splitreg.predictions <- predict(cv.splitreg, newx=x.test) mean((splitreg.predictions-y.test)^2)/sigma.epsilon^2 # Looking at the MSPEs for all the possible splits (out-of-sample) split.mspes <- sapply(1:nrow(split.out$splits), function(x, n.test, x.test, split.out, y.test) {mean((y.test-cbind(rep(1, n.test), x.test) %*% split.out$betas[,x])^2)}, n.test, x.test, split.out, y.test)/sigma.epsilon^2 # Minimum MSPE for the splits (out-of-sample) min(split.mspes) split.mspes[split.out$optimal.split] # Optimal splits comparison (out-of-sample) split.out$splits[which.min(split.mspes),] split.out$optimal.split.var # Optimal betas comparison (out-of-sample) split.out$betas[,which.min(split.mspes), drop=FALSE] coef(split.out)
This package is free and open source software, licensed under GPL (>= 2).
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.