Nothing
## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
warning = FALSE
)
library(nestedcv)
library(pROC)
## ----eval = FALSE-------------------------------------------------------------
# install.packages("nestedcv")
# library(nestedcv)
## -----------------------------------------------------------------------------
## Example binary classification problem with P >> n
x <- matrix(rnorm(150 * 2e+04), 150, 2e+04) # predictors
y <- factor(rbinom(150, 1, 0.5)) # binary response
## Partition data into 2/3 training set, 1/3 test set
trainSet <- caret::createDataPartition(y, p = 0.66, list = FALSE)
## t-test filter using whole test set
filt <- ttest_filter(y, x, nfilter = 100)
filx <- x[, filt]
## Train glmnet on training set only using filtered predictor matrix
library(glmnet)
fit <- cv.glmnet(filx[trainSet, ], y[trainSet], family = "binomial")
## Predict response on test set
predy <- predict(fit, newx = filx[-trainSet, ], s = "lambda.min", type = "class")
predy <- as.vector(predy)
predyp <- predict(fit, newx = filx[-trainSet, ], s = "lambda.min", type = "response")
predyp <- as.vector(predyp)
output <- data.frame(testy = y[-trainSet], predy = predy, predyp = predyp)
## Results on test set
## shows bias since univariate filtering was applied to whole dataset
predSummary(output)
## Nested CV
fit2 <- nestcv.glmnet(y, x, family = "binomial", alphaSet = 7:10 / 10,
filterFUN = ttest_filter,
filter_options = list(nfilter = 100))
fit2
testroc <- pROC::roc(output$testy, output$predyp, direction = "<", quiet = TRUE)
inroc <- innercv_roc(fit2)
plot(fit2$roc)
lines(inroc, col = 'blue')
lines(testroc, col = 'red')
legend('bottomright', legend = c("Nested CV", "Left-out inner CV folds",
"Test partition, non-nested filtering"),
col = c("black", "blue", "red"), lty = 1, lwd = 2, bty = "n")
## ----out.width='75%', fig.align="center", echo=FALSE--------------------------
knitr::include_graphics("fig1.svg")
## ----out.width='75%', fig.align="center", echo=FALSE--------------------------
knitr::include_graphics("fig2.svg")
## ----eval = FALSE-------------------------------------------------------------
# # Raw RNA-Seq data for this example is located at:
# # https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-11611/
#
# # set up data
# load("/../R4RA_270821.RData")
#
# index <- r4ra.meta$Outliers_Detected_On_PCA != "outlier" & r4ra.meta$Visit == 3 &
# !is.na(r4ra.meta$Visit)
# metadata <- r4ra.meta[index, ]
# dim(metadata) # 133 individuals
#
# medians <- Rfast::rowMedians(as.matrix(r4ra.vst))
# data <- t(as.matrix(r4ra.vst))
# # remove low expressed genes
# data <- data[index, medians > 6]
# dim(data) # 16254 genes
#
# # Rituximab cohort only
# yrtx <- metadata$CDAI.response.status.V7[metadata$Randomised.medication == "Rituximab"]
# yrtx <- factor(yrtx)
# data.rtx <- data[metadata$Randomised.medication == "Rituximab", ]
#
# # no filter
# res.rtx <- nestcv.glmnet(y = yrtx, x = data.rtx,
# family = "binomial", cv.cores = 8,
# alphaSet = seq(0.7, 1, 0.05))
# res.rtx
## ----eval = FALSE-------------------------------------------------------------
# # t-test filter
# res.rtx <- nestcv.glmnet(y = yrtx, x = data.rtx, filterFUN = ttest_filter,
# filter_options = list(nfilter = 300, p_cutoff = NULL),
# family = "binomial", cv.cores = 8,
# alphaSet = seq(0.7, 1, 0.05))
# summary(res.rtx)
## ----eval = FALSE-------------------------------------------------------------
# plot_alphas(res.rtx)
# plot_lambdas(res.rtx)
## ----out.width='100%', fig.align="center", echo=FALSE-------------------------
knitr::include_graphics("plot_alpha_lam.png")
## ----eval = FALSE-------------------------------------------------------------
# # Fold 1 line plot
# plot(res.rtx$outer_result[[1]]$cvafit)
#
# # Scatter plot
# plot(res.rtx$outer_result[[1]]$cvafit, type = 'p')
#
# # Number of non-zero coefficients
# plot(res.rtx$outer_result[[1]]$cvafit, xaxis = 'nvar')
## ----out.width='100%', fig.align="center", echo=FALSE-------------------------
knitr::include_graphics("plot_cva.png")
## ----eval = FALSE-------------------------------------------------------------
# # Outer CV ROC
# plot(res.rtx$roc, main = "Outer fold ROC", font.main = 1, col = 'blue')
# legend("bottomright", legend = paste0("AUC = ", signif(pROC::auc(res.rtx$roc), 3)), bty = 'n')
#
# # Inner CV ROC
# rtx.inroc <- innercv_roc(res.rtx)
# plot(rtx.inroc, main = "Inner fold ROC", font.main = 1, col = 'red')
# legend("bottomright", legend = paste0("AUC = ", signif(pROC::auc(rtx.inroc), 3)), bty = 'n')
## ----out.width='100%', fig.align="center", echo=FALSE-------------------------
knitr::include_graphics("roc.png")
## ----eval = FALSE-------------------------------------------------------------
# boxplot_model(res.rtx, ylab = "VST")
## ----out.width='70%', fig.align="center", echo=FALSE--------------------------
knitr::include_graphics("boxplot.png")
## ----eval = FALSE-------------------------------------------------------------
# # Outer LOOCV
# res.rtx <- nestcv.glmnet(y = yrtx, x = data.rtx, min_1se = 0, filterFUN = ttest_filter,
# filter_options = list(nfilter = 300, p_cutoff = NULL),
# outer_method = "LOOCV",
# family = "binomial", cv.cores = 8,
# alphaSet = seq(0.7, 1, 0.05))
# summary(res.rtx)
## ----eval = FALSE-------------------------------------------------------------
# # Random forest filter
# res.rtx <- nestcv.glmnet(y = yrtx, x = data.rtx, min_1se = 0.5, filterFUN = rf_filter,
# filter_options = list(nfilter = 300),
# family = "binomial", cv.cores = 8,
# alphaSet = seq(0.7, 1, 0.05))
# summary(res.rtx)
#
# # ReliefF algorithm filter
# res.rtx <- nestcv.glmnet(y = yrtx, x = data.rtx, min_1se = 0, filterFUN = relieff_filter,
# filter_options = list(nfilter = 300),
# family = "binomial", cv.cores = 8,
# alphaSet = seq(0.7, 1, 0.05))
# summary(res.rtx)
## -----------------------------------------------------------------------------
library(mlbench)
data(BostonHousing2)
dat <- BostonHousing2
y <- dat$cmedv ## continuous outcome
x <- subset(dat, select = -c(cmedv, medv, town))
stat_filter(y, x, type = "full")
## ----eval = FALSE-------------------------------------------------------------
# filter <- function(y, x, ...) {}
## -----------------------------------------------------------------------------
## Imbalanced dataset
set.seed(1, "L'Ecuyer-CMRG")
x <- matrix(rnorm(150 * 2e+04), 150, 2e+04) # predictors
y <- factor(rbinom(150, 1, 0.2)) # imbalanced binary response
table(y)
## first 30 parameters are weak predictors
x[, 1:30] <- rnorm(150 * 30, 0, 1) + as.numeric(y)*0.7
## -----------------------------------------------------------------------------
out <- randomsample(y, x)
y2 <- out$y
x2 <- out$x
table(y2)
## Nested CV glmnet with unnested balancing by random oversampling on
## whole dataset
fit1 <- nestcv.glmnet(y2, x2, family = "binomial", alphaSet = 1,
n_outer_folds = 4,
cv.cores=2,
filterFUN = ttest_filter)
fit1$summary
## -----------------------------------------------------------------------------
out <- randomsample(y, x, minor=1, major=0.4)
y2 <- out$y
x2 <- out$x
table(y2)
## Nested CV glmnet with unnested balancing by random undersampling on
## whole dataset
fit1b <- nestcv.glmnet(y2, x2, family = "binomial", alphaSet = 1,
n_outer_folds = 4,
cv.cores=2,
filterFUN = ttest_filter)
fit1b$summary
## Balance x & y outside of CV loop by SMOTE
out <- smote(y, x)
y2 <- out$y
x2 <- out$x
table(y2)
## Nested CV glmnet with unnested balancing by SMOTE on whole dataset
fit2 <- nestcv.glmnet(y2, x2, family = "binomial", alphaSet = 1,
n_outer_folds = 4,
cv.cores=2,
filterFUN = ttest_filter)
fit2$summary
## -----------------------------------------------------------------------------
## Nested CV glmnet with nested balancing by random oversampling
fit3 <- nestcv.glmnet(y, x, family = "binomial", alphaSet = 1,
n_outer_folds = 4,
cv.cores=2,
balance = "randomsample",
filterFUN = ttest_filter)
fit3$summary
## -----------------------------------------------------------------------------
## Nested CV glmnet with weights
w <- weight(y)
table(w)
fit4 <- nestcv.glmnet(y, x, family = "binomial", alphaSet = 1,
n_outer_folds = 4,
cv.cores=2,
weights = w,
filterFUN = ttest_filter)
fit4$summary
## -----------------------------------------------------------------------------
plot(fit1$roc, col='green')
lines(fit1b$roc, col='red')
lines(fit2$roc, col='blue')
lines(fit3$roc)
lines(fit4$roc, col='purple')
legend('bottomright', legend = c("Unnested random oversampling",
"Unnested SMOTE",
"Unnested random undersampling",
"Nested random oversampling",
"Nested glmnet with weights"),
col = c("green", "blue", "red", "black", "purple"), lty = 1, lwd = 2, bty = "n", cex=0.8)
## ----eval = FALSE-------------------------------------------------------------
# balance <- function(y, x, ...) {
#
# return(list(y = y, x = x))
# }
## ----eval = FALSE-------------------------------------------------------------
# # nested CV using caret
# tg <- expand.grid(lambda = exp(seq(log(2e-3), log(1e0), length.out = 100)),
# alpha = seq(0.8, 1, 0.1))
# ncv <- nestcv.train(y = yrtx, x = data.rtx,
# method = "glmnet",
# savePredictions = "final",
# filterFUN = ttest_filter, filter_options = list(nfilter = 300),
# tuneGrid = tg, cv.cores = 8)
# ncv$summary
#
# # Plot ROC on outer folds
# plot(ncv$roc)
#
# # Plot ROC on inner LO folds
# inroc <- innercv_roc(ncv)
# plot(inroc)
# pROC::auc(inroc)
#
# # Extract coefficients of final fitted model
# glmnet_coefs(ncv$final_fit$finalModel, s = ncv$finalTune$lambda)
## ----eval = FALSE-------------------------------------------------------------
# # Example tuning plot for outer fold 1
# plot(ncv$outer_result[[1]]$fit, xTrans = log)
#
# # ggplot2 version
# ggplot(ncv$outer_result[[1]]$fit) +
# scale_x_log10()
## ----eval = FALSE-------------------------------------------------------------
# data(iris)
# y <- iris$Species
# x <- iris[, -5]
#
# out_folds <- caret::createFolds(y, k = 8)
# in_folds <- lapply(out_folds, function(i) {
# train_y <- y[-i]
# caret::createFolds(train_y, k = 8)
# })
#
# res <- nestcv.train(y, x, method = "rf",
# cv.cores = 8,
# inner_folds = in_folds,
# outer_folds = out_folds)
# summary(res)
# res$outer_folds # show which outer fold indices were used
## ----eval = FALSE-------------------------------------------------------------
# # for nestcv.glmnet object
# preds <- predict(res.rtx, newdata = data.rtx, type = 'response')
#
# # for nestcv.train object
# preds <- predict(ncv, newdata = data.rtx)
## ----eval = FALSE-------------------------------------------------------------
# parallel::detectCores(logical = FALSE)
## ----eval = FALSE-------------------------------------------------------------
# set.seed(123, "L'Ecuyer-CMRG")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.