eclust.R
In eclust: Environment Based Clustering for Interpretable Predictive Models in High Dimensional Data

## ----eval=FALSE, echo=TRUE-----------------------------------------------
#  install.packages("pacman")
#  pacman::p_install_gh("sahirbhatnagar/eclust")

## ----setup, message=FALSE, echo=FALSE------------------------------------
library(knitr)
library(eclust)
library(data.table)
options(scipen = 1, digits = 5)

## ---- echo=FALSE---------------------------------------------------------
pander::pander(data.frame(`function name` = grep("^r_",pacman::p_functions("eclust"), value = T)))

## ---- echo=FALSE---------------------------------------------------------
pander::pander(data.frame(`function name` = grep("^s_",pacman::p_functions("eclust"), value = T)))

## ---- echo=FALSE---------------------------------------------------------
pander::pander(data.frame(`function name` = grep("^u_",pacman::p_functions("eclust"), value = T)))

## ---- eval = TRUE--------------------------------------------------------
# load the data
data("tcgaov")
tcgaov[1:5,1:6, with = FALSE]

# use log survival as the response
Y <- log(tcgaov[["OS"]])

# specify the environment variable
E <- tcgaov[["E"]]

# specify the matrix of genes only
genes <- as.matrix(tcgaov[,-c("OS","rn","subtype","E","status"),with = FALSE])

# for this example the training set will be all subjects.
# change `p` argument to create a train and test set.
trainIndex <- drop(caret::createDataPartition(Y, p = 1, list = FALSE, times = 1))
testIndex <- trainIndex

## ------------------------------------------------------------------------
cluster_res <- r_cluster_data(data = genes,
                              response = Y,
                              exposure = E,
                              train_index = trainIndex,
                              test_index = testIndex,
                              cluster_distance = "corr",
                              eclust_distance = "diffcorr",
                              measure_distance = "euclidean",
                              clustMethod = "hclust",
                              cutMethod = "dynamic",
                              method = "average",
                              nPC = 1,
                              minimum_cluster_size = 30)

# the number of clusters determined by the similarity matrices specified
# in the cluster_distance and eclust_distance arguments. This will always be larger
# than cluster_res$clustersAll$nclusters which is based on the similarity matrix
# specified in the cluster_distance argument
cluster_res$clustersAddon$nclusters

# the number of clusters determined by the similarity matrices specified
# in the cluster_distance argument only
cluster_res$clustersAll$nclusters

# what's in the cluster_res object
names(cluster_res)

## ------------------------------------------------------------------------
# prepare data for use with earth function
avg_eclust_interaction <- r_prepare_data(data = cbind(cluster_res$clustersAddon$averageExpr, 
                                                     Y = Y[trainIndex],
                                                     E = E[trainIndex]),
                                        response = "Y", exposure = "E")

head(avg_eclust_interaction[["X"]])

## ------------------------------------------------------------------------
# install and load earth package
pacman::p_load(char = "earth")

fit_earth <- earth::earth(x = avg_eclust_interaction[["X"]], y = avg_eclust_interaction[["Y"]], 
                          pmethod = "backward", 
                          keepxy = TRUE, degree = 2, trace = 1, nk = 1000)

coef(fit_earth)

## ------------------------------------------------------------------------
u_extract_selected_earth(fit_earth)

## ------------------------------------------------------------------------
# Genes in cluster 5
cluster_res$clustersAddonMembership[cluster %in% 5]

# variable importance
earth::evimp(fit_earth)

## ------------------------------------------------------------------------
pacman::p_load(eclust)
d0 <- s_modules(n = 100, p = 1000, rho = 0, exposed = FALSE,
                modProportions = c(0.15,0.15,0.15,0.15,0.15,0.25),
                minCor = 0.01,
                maxCor = 1,
                corPower = 1,
                propNegativeCor = 0.3,
                backgroundNoise = 0.5,
                signed = FALSE,
                leaveOut = 1:4)

d1 <- s_modules(n = 100, p = 1000, rho = 0.9, exposed = TRUE,
                modProportions = c(0.15,0.15,0.15,0.15,0.15,0.25),
                minCor = 0.4,
                maxCor = 1,
                corPower = 0.3,
                propNegativeCor = 0.3,
                backgroundNoise = 0.5,
                signed = FALSE)

# get the true cluster labels
truemodule1 <- d1$setLabels
table(truemodule1)

## ------------------------------------------------------------------------
pacman::p_load(magrittr)

X <- rbind(d0$datExpr, d1$datExpr) %>%
  magrittr::set_colnames(paste0("Gene", 1:1000)) %>%
  magrittr::set_rownames(paste0("Subject",1:200))

## ---- fig.show='hold', tidy=FALSE----------------------------------------
pacman::p_load(pheatmap)
pacman::p_load(viridis)

pheatmap::pheatmap(cor(X[1:100,]),
                   show_rownames = F, 
                   show_colnames = F, 
                   color = viridis(100))

pheatmap::pheatmap(cor(X[101:200,]),
                   show_rownames = F, 
                   show_colnames = F, 
                   color = viridis(100))

## ---- eval=T-------------------------------------------------------------
betaMainEffect <- vector("double", length = 1000)
betaMainInteractions <- vector("double", length = 1000)

# the first 25 in the 3rd block are active
betaMainEffect[which(truemodule1 %in% 3)[1:50]] <- runif(50, 0.9, 1.1)

# the first 25 in the 4th block are active
betaMainEffect[which(truemodule1 %in% 4)[1:50]] <- runif(50, 0.9, 1.1)

# the interaction effects
betaMainInteractions[which(betaMainEffect!=0)] <- runif(50, 0.4, 0.6)

# the environment effect
betaE <- 2

# the total beta vector
beta <- c(betaMainEffect, betaE, betaMainInteractions)

## ------------------------------------------------------------------------
result <- s_generate_data(p = 1000, 
                          X = X,
                          beta = beta,
                          include_interaction = TRUE,
                          cluster_distance = "tom",
                          n = 200, 
                          n0 = 100,
                          eclust_distance = "difftom",
                          signal_to_noise_ratio = 1,
                          distance_method = "euclidean",
                          cluster_method = "hclust",
                          cut_method = "dynamic",
                          agglomeration_method = "average",
                          nPC = 1)
names(result)

## ---- echo=FALSE---------------------------------------------------------
pander::pander(data.frame(`function name` = c("s_pen_separate", "s_pen_clust", "s_mars_separate", "s_mars_clust"),
           `General Approach` = rep(c("SEPARATE","CLUST, ECLUST"), 2),
           `model` = c(rep(c("lasso, elasticnet, mcp, scad"), 2), rep("MARS",2))))

## ------------------------------------------------------------------------
# Provide ECLUST clustering results to the gene_groups argument
pen_res <- s_pen_clust(x_train = result[["X_train"]],
                       x_test = result[["X_test"]],
                       y_train = result[["Y_train"]],
                       y_test = result[["Y_test"]],
                       s0 = result[["S0"]],
                       gene_groups = result[["clustersAddon"]],
                       summary = "pc",
                       model = "lasso",
                       exp_family = "gaussian",
                       clust_type = "ECLUST",
                       include_interaction = TRUE)
unlist(pen_res)

## ---- echo=FALSE---------------------------------------------------------
pander::pander(data.frame(`object name` = c("tom_train_all","tom_train_diff","tom_train_e1","tom_train_e0","corr_train_all","corr_train_diff","corr_train_e1","corr_train_e0","fisherScore","corScor")))

## ------------------------------------------------------------------------
# check that the object is of class similarity
class(result$tom_train_e1)

# get clustering tree
hc <- hclust(as.dist(1 - result$tom_train_e1), method = "average")

plot(result$tom_train_e1, 
     truemodule = truemodule1, 
     cluster_rows = hc, 
     cluster_cols = hc,
     active = as.numeric(betaMainEffect!=0))

## ------------------------------------------------------------------------
class(cluster_res)

## ------------------------------------------------------------------------
plot(cluster_res, show_column_names = FALSE)

Any scripts or data that you put into this service are public.

eclust documentation built on May 1, 2019, 8:46 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

eclust
Environment Based Clustering for Interpretable Predictive Models in High Dimensional Data

inst/doc/eclust.R
In eclust: Environment Based Clustering for Interpretable Predictive Models in High Dimensional Data

Try the eclust package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

eclust Environment Based Clustering for Interpretable Predictive Models in High Dimensional Data

inst/doc/eclust.R In eclust: Environment Based Clustering for Interpretable Predictive Models in High Dimensional Data

Try the eclust package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

eclust
Environment Based Clustering for Interpretable Predictive Models in High Dimensional Data

inst/doc/eclust.R
In eclust: Environment Based Clustering for Interpretable Predictive Models in High Dimensional Data