knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
# Initialization
library(cssr)

# # Initialize parallel processing--only works on Mac or Unix
# registerDoParallel(cores = detectCores() - 1)

# Set seed for reproducibility
set.seed(23592)

Getting Started

Cluster stability selection is a feature selection method. Given data $(X, y)$, cluster stability selection selects the variables in $X$ that are useful for predicting $y$.

# Generate some data
data <- genLatentData(n = 200, # Sample size
                      p = 100, # Number of features 
                      cluster_size = 10, # Number of features in a cluster 
                      # correlated with a latent variable
                      k_unclustered = 10, # Number of unclustered features 
                      # that influence y
                      snr = 3 # Signal-to-noise ratio in the response y 
                      # generated from the data.
                      )

X <- data$X
y <- data$y

output <- cssSelect(X, y)

output$selected_feats

Cluster stability selection is designed to be particularly useful for data that include clustered features--groups of highly correlated features. The data we generated earlier contain a cluster of 10 features (specifically, the first 10 columns of $X$) that are highly correlated with each other and an unobserved variable $Z$ that is associated with $y$. We can tell cluster stability selection about this cluster using the "clusters" argument.

clus_output <- cssSelect(X, y, clusters=list("Z_cluster"=1:10))

clus_output$selected_feats

Cluster stability selection returns both a set of selected clusters (below) and all of the features contained within those clusters (as in the above).

clus_output$selected_clusts

need to finish vignette...

clusters <- list("Z_clust"=1:10, 50:55)

# Wrapper functions (easy!)
n_test <- 50
n <- 200
p <- 100
testx <- matrix(rnorm(n_test*p), nrow=n_test, ncol=p)

cssPredict(X, y, testx, clusters)

# Get a good lambda
inds <- 1:round(n/2)
lambda <- getLassoLambda(X[setdiff(1:n, inds), ], y[setdiff(1:n, inds)])

# clusters <- list(1:10, 50:55)
# clusters <- 1:10

results <- css(X=X, y=y, lambda=lambda
               , clusters=clusters
               # , clusters=list()
               # , clusters=1:10
               # , sampling.type = "SS"
               # B = 100,
               # , prop_feats_remove = .5
               , train_inds = inds
)

str(results)

w <- "sparse"
c <- 0.3

predictions <- results |> getCssPreds(testX = testx, weighting=w,
                                      cutoff=c
                                      , min_num_clusts=1
                                      , max_num_clusts=3
)
predictions

train_x <- matrix(rnorm(n_test*p), nrow=n_test, ncol=p)
train_y <- rnorm(n_test)

preds2 <- results |> getCssPreds(testX = testx, weighting=w,
                                 cutoff=c, min_num_clusts=1, max_num_clusts=3,
                                 trainX=train_x
                                 , trainY=train_y
)

preds2

selections <- results |> getCssSelections(weighting=w, cutoff=c
                                          # , min_num_clusts=1
                                          # , max_num_clusts=3
)

str(selections)

selections$selected_clusts
selections$selected_feats

# results |> print.cssr(cutoff=c, min_num_clusts=1, max_num_clusts=3)

x_design <- results |> getCssDesign(testx, weighting=w, cutoff=c, min_num_clusts=1, max_num_clusts=3)

str(x_design)


gregfaletto/cssr documentation built on March 3, 2023, 1 p.m.