Nothing
## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
warning = FALSE
)
library(nestedcv)
library(pROC)
## ----eval = FALSE-------------------------------------------------------------
# install.packages("nestedcv")
# library(nestedcv)
## -----------------------------------------------------------------------------
## Example binary classification problem with P >> n
x <- matrix(rnorm(150 * 2e+04), 150, 2e+04) # predictors
y <- factor(rbinom(150, 1, 0.5)) # binary response
## Partition data into 2/3 training set, 1/3 test set
trainSet <- caret::createDataPartition(y, p = 0.66, list = FALSE)
## t-test filter using whole test set
filt <- ttest_filter(y, x, nfilter = 100)
filx <- x[, filt]
## Train glmnet on training set only using filtered predictor matrix
library(glmnet)
fit <- cv.glmnet(filx[trainSet, ], y[trainSet], family = "binomial")
## Predict response on test set
predy <- predict(fit, newx = filx[-trainSet, ], s = "lambda.min", type = "class")
predy <- as.vector(predy)
predyp <- predict(fit, newx = filx[-trainSet, ], s = "lambda.min", type = "response")
predyp <- as.vector(predyp)
output <- data.frame(testy = y[-trainSet], predy = predy, predyp = predyp)
## Results on test set
## shows bias since univariate filtering was applied to whole dataset
predSummary(output)
## Nested CV
fit2 <- nestcv.glmnet(y, x, family = "binomial", alphaSet = 7:10 / 10,
filterFUN = ttest_filter,
filter_options = list(nfilter = 100))
fit2
testroc <- pROC::roc(output$testy, output$predyp, direction = "<", quiet = TRUE)
inroc <- innercv_roc(fit2)
plot(fit2$roc)
lines(inroc, col = 'blue')
lines(testroc, col = 'red')
legend('bottomright', legend = c("Nested CV", "Left-out inner CV folds",
"Test partition, non-nested filtering"),
col = c("black", "blue", "red"), lty = 1, lwd = 2, bty = "n")
## ----out.width='75%', fig.align="center", echo=FALSE--------------------------
knitr::include_graphics("fig1.svg")
## ----out.width='75%', fig.align="center", echo=FALSE--------------------------
knitr::include_graphics("fig2.svg")
## ----eval = FALSE-------------------------------------------------------------
# # Raw RNA-Seq data for this example is located at:
# # https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-11611/
#
# # set up data
# load("/../R4RA_270821.RData")
#
# index <- r4ra.meta$Outliers_Detected_On_PCA != "outlier" & r4ra.meta$Visit == 3 &
# !is.na(r4ra.meta$Visit)
# metadata <- r4ra.meta[index, ]
# dim(metadata) # 133 individuals
#
# medians <- Rfast::rowMedians(as.matrix(r4ra.vst))
# data <- t(as.matrix(r4ra.vst))
# # remove low expressed genes
# data <- data[index, medians > 6]
# dim(data) # 16254 genes
#
# # Rituximab cohort only
# yrtx <- metadata$CDAI.response.status.V7[metadata$Randomised.medication == "Rituximab"]
# yrtx <- factor(yrtx)
# data.rtx <- data[metadata$Randomised.medication == "Rituximab", ]
#
# # no filter
# res.rtx <- nestcv.glmnet(y = yrtx, x = data.rtx,
# family = "binomial", cv.cores = 8,
# alphaSet = seq(0.7, 1, 0.05))
# res.rtx
## ----eval = FALSE-------------------------------------------------------------
# # t-test filter
# res.rtx <- nestcv.glmnet(y = yrtx, x = data.rtx, filterFUN = ttest_filter,
# filter_options = list(nfilter = 300, p_cutoff = NULL),
# family = "binomial", cv.cores = 8,
# alphaSet = seq(0.7, 1, 0.05))
# summary(res.rtx)
## ----eval = FALSE-------------------------------------------------------------
# plot_alphas(res.rtx)
# plot_lambdas(res.rtx)
## ----out.width='100%', fig.align="center", echo=FALSE-------------------------
knitr::include_graphics("plot_alpha_lam.png")
## ----eval = FALSE-------------------------------------------------------------
# # Fold 1 line plot
# plot(res.rtx$outer_result[[1]]$cvafit)
#
# # Scatter plot
# plot(res.rtx$outer_result[[1]]$cvafit, type = 'p')
#
# # Number of non-zero coefficients
# plot(res.rtx$outer_result[[1]]$cvafit, xaxis = 'nvar')
## ----out.width='100%', fig.align="center", echo=FALSE-------------------------
knitr::include_graphics("plot_cva.png")
## ----eval = FALSE-------------------------------------------------------------
# # Outer CV ROC
# plot(res.rtx$roc, main = "Outer fold ROC", font.main = 1, col = 'blue')
# legend("bottomright", legend = paste0("AUC = ", signif(pROC::auc(res.rtx$roc), 3)), bty = 'n')
#
# # Inner CV ROC
# rtx.inroc <- innercv_roc(res.rtx)
# plot(rtx.inroc, main = "Inner fold ROC", font.main = 1, col = 'red')
# legend("bottomright", legend = paste0("AUC = ", signif(pROC::auc(rtx.inroc), 3)), bty = 'n')
## ----out.width='100%', fig.align="center", echo=FALSE-------------------------
knitr::include_graphics("roc.png")
## ----eval = FALSE-------------------------------------------------------------
# # Outer LOOCV
# res.rtx <- nestcv.glmnet(y = yrtx, x = data.rtx, min_1se = 0, filterFUN = ttest_filter,
# filter_options = list(nfilter = 300, p_cutoff = NULL),
# outer_method = "LOOCV",
# family = "binomial", cv.cores = 8,
# alphaSet = seq(0.7, 1, 0.05))
# summary(res.rtx)
## ----eval = FALSE-------------------------------------------------------------
# # Random forest filter
# res.rtx <- nestcv.glmnet(y = yrtx, x = data.rtx, min_1se = 0.5, filterFUN = rf_filter,
# filter_options = list(nfilter = 300),
# family = "binomial", cv.cores = 8,
# alphaSet = seq(0.7, 1, 0.05))
# summary(res.rtx)
#
# # ReliefF algorithm filter
# res.rtx <- nestcv.glmnet(y = yrtx, x = data.rtx, min_1se = 0, filterFUN = relieff_filter,
# filter_options = list(nfilter = 300),
# family = "binomial", cv.cores = 8,
# alphaSet = seq(0.7, 1, 0.05))
# summary(res.rtx)
## -----------------------------------------------------------------------------
library(mlbench)
data(BostonHousing2)
dat <- BostonHousing2
y <- dat$cmedv ## continuous outcome
x <- subset(dat, select = -c(cmedv, medv, town))
stat_filter(y, x, type = "full")
## ----eval = FALSE-------------------------------------------------------------
# filter <- function(y, x, ...) {}
## ----eval=FALSE---------------------------------------------------------------
# # this example requires the missRanger package
# library(missRanger)
#
# x_na <- generateNA(x) # insert NA into x
# x_na <- as.matrix(x_na)
#
# # missRanger requires a dataframe, whereas glmnet requires a matrix
# impute_x <- function(x, ...) {
# missRanger(as.data.frame(x), num.trees = 50, ...)
# }
#
# res <- nestcv.glmnet(y, x_na, family = "gaussian",
# alphaSet = 1,
# n_outer_folds = 3, cv.cores = 2,
# modifyX = impute_x,
# na.option = "pass")
## ----eval=FALSE---------------------------------------------------------------
# # receives training data from `x` and `y` only
# # returns object with class 'modxy'
# modxy <- function(y, x) {
# sc <- scale(x)
# cen <- attr(sc, "scaled:center")
# sca <- attr(sc, "scaled:scale")
# out <- cbind(cen, sca)
# class(out) <- "modxy"
# out
# }
#
# # define predict function for class 'modxy'
# # applied independently to train and test folds of `x`
# predict.modxy <- function(object, newdata, ...) {
# scale(newdata, center = object[,1], scale = object[,2])
# }
#
# res <- nestcv.glmnet(y, x, family = "gaussian", alphaSet = 1,
# n_outer_folds = 3, cv.cores = 3,
# modifyX = modxy, modifyX_useY = TRUE)
## -----------------------------------------------------------------------------
## Imbalanced dataset
set.seed(1, "L'Ecuyer-CMRG")
x <- matrix(rnorm(150 * 2e+04), 150, 2e+04) # predictors
y <- factor(rbinom(150, 1, 0.2)) # imbalanced binary response
table(y)
## first 30 parameters are weak predictors
x[, 1:30] <- rnorm(150 * 30, 0, 1) + as.numeric(y)*0.7
## -----------------------------------------------------------------------------
out <- randomsample(y, x)
y2 <- out$y
x2 <- out$x
table(y2)
## Nested CV glmnet with unnested balancing by random oversampling on
## whole dataset
fit1 <- nestcv.glmnet(y2, x2, family = "binomial", alphaSet = 1,
n_outer_folds = 4,
cv.cores=2,
filterFUN = ttest_filter)
fit1$summary
## -----------------------------------------------------------------------------
out <- randomsample(y, x, minor=1, major=0.4)
y2 <- out$y
x2 <- out$x
table(y2)
## Nested CV glmnet with unnested balancing by random undersampling on
## whole dataset
fit1b <- nestcv.glmnet(y2, x2, family = "binomial", alphaSet = 1,
n_outer_folds = 4,
cv.cores=2,
filterFUN = ttest_filter)
fit1b$summary
## Balance x & y outside of CV loop by SMOTE
out <- smote(y, x)
y2 <- out$y
x2 <- out$x
table(y2)
## Nested CV glmnet with unnested balancing by SMOTE on whole dataset
fit2 <- nestcv.glmnet(y2, x2, family = "binomial", alphaSet = 1,
n_outer_folds = 4,
cv.cores=2,
filterFUN = ttest_filter)
fit2$summary
## -----------------------------------------------------------------------------
## Nested CV glmnet with nested balancing by random oversampling
fit3 <- nestcv.glmnet(y, x, family = "binomial", alphaSet = 1,
n_outer_folds = 4,
cv.cores=2,
balance = "randomsample",
filterFUN = ttest_filter)
fit3$summary
## -----------------------------------------------------------------------------
## Nested CV glmnet with weights
w <- weight(y)
table(w)
fit4 <- nestcv.glmnet(y, x, family = "binomial", alphaSet = 1,
n_outer_folds = 4,
cv.cores=2,
weights = w,
filterFUN = ttest_filter)
fit4$summary
## ----fig.dim = c(5, 5)--------------------------------------------------------
plot(fit1$roc, col='green')
lines(fit1b$roc, col='red')
lines(fit2$roc, col='blue')
lines(fit3$roc)
lines(fit4$roc, col='purple')
legend('bottomright', legend = c("Unnested random oversampling",
"Unnested SMOTE",
"Unnested random undersampling",
"Nested random oversampling",
"Nested glmnet with weights"),
col = c("green", "blue", "red", "black", "purple"), lty = 1, lwd = 2, bty = "n", cex=0.8)
## ----eval = FALSE-------------------------------------------------------------
# balance <- function(y, x, ...) {
#
# return(list(y = y, x = x))
# }
## ----eval = FALSE-------------------------------------------------------------
# # nested CV using caret
# tg <- expand.grid(lambda = exp(seq(log(2e-3), log(1e0), length.out = 100)),
# alpha = seq(0.8, 1, 0.1))
# ncv <- nestcv.train(y = yrtx, x = data.rtx,
# method = "glmnet",
# savePredictions = "final",
# filterFUN = ttest_filter, filter_options = list(nfilter = 300),
# tuneGrid = tg, cv.cores = 8)
# ncv$summary
#
# # Plot ROC on outer folds
# plot(ncv$roc)
#
# # Plot ROC on inner LO folds
# inroc <- innercv_roc(ncv)
# plot(inroc)
# pROC::auc(inroc)
#
# # Extract coefficients of final fitted model
# glmnet_coefs(ncv$final_fit$finalModel, s = ncv$finalTune$lambda)
## ----eval = FALSE-------------------------------------------------------------
# # Example tuning plot for outer fold 1
# plot(ncv$outer_result[[1]]$fit, xTrans = log)
#
# # ggplot2 version
# ggplot(ncv$outer_result[[1]]$fit) +
# scale_x_log10()
## ----eval = FALSE-------------------------------------------------------------
# data(iris)
# y <- iris$Species
# x <- iris[, -5]
#
# out_folds <- caret::createFolds(y, k = 8)
# in_folds <- lapply(out_folds, function(i) {
# train_y <- y[-i]
# caret::createFolds(train_y, k = 8)
# })
#
# res <- nestcv.train(y, x, method = "rf",
# cv.cores = 8,
# inner_folds = in_folds,
# outer_folds = out_folds)
# summary(res)
# res$outer_folds # show which outer fold indices were used
## -----------------------------------------------------------------------------
library(mlbench)
data(Sonar)
y <- Sonar$Class
x <- Sonar[, -61]
fit1 <- nestcv.glmnet(y, x, family = "binomial", alphaSet = 1,
n_outer_folds = 4, cv.cores = 2)
metrics(fit1, extra = TRUE)
## ----fig.dim = c(10, 5)-------------------------------------------------------
fit1$prc <- prc(fit1)
# precision-recall AUC values
fit1$prc$auc
# plot ROC and PR curves
op <- par(mfrow = c(1, 2), mar = c(4, 4, 2, 2) +.1)
plot(fit1$roc, col = "red", main = "ROC", las = 1)
plot(fit1$prc, col = "red", main = "Precision-recall")
par(op)
## ----eval = FALSE-------------------------------------------------------------
# # for nestcv.glmnet object
# preds <- predict(res.rtx, newdata = data.rtx, type = 'response')
#
# # for nestcv.train object
# preds <- predict(ncv, newdata = data.rtx)
## ----results='hide'-----------------------------------------------------------
data(Sonar)
y <- Sonar$Class
x <- Sonar[, -61]
# single fit
fit <- nestcv.glmnet(y, x, family = "binomial", alphaSet = 1,
n_outer_folds = 4, cv.cores = 2)
# repeated nested CV
set.seed(123, "L'Ecuyer-CMRG")
repcv <- nestcv.glmnet(y, x, family = "binomial", alphaSet = 1,
n_outer_folds = 4) |>
repeatcv(8, rep.cores = 2)
## -----------------------------------------------------------------------------
repcv
summary(repcv)
## ----eval=FALSE---------------------------------------------------------------
# folds <- repeatfolds(y, repeats = 3, n_outer_folds = 4)
#
# repcv <- nestcv.glmnet(y, x, family = "binomial", alphaSet = 1,
# n_outer_folds = 4) |>
# repeatcv(3, repeat_folds = folds, rep.cores = 2)
# repcv
## ----fig.dim = c(10, 5)-------------------------------------------------------
op <- par(mfrow = c(1, 2), mar = c(4, 4, 2, 2) +.1)
plot(fit1$roc, col = "red", las = 1, main = "ROC") # single nested cv
lines(repcv$roc) # repeated nested cv
repcv$prc <- prc(repcv) # calculate precision-recall curve
plot(fit1$prc, col = "red", main = "Precision-recall") # single nested cv
lines(repcv$prc) # repeated nested cv
legend("topright", legend = c("single nested cv", "repeat nested cv"),
col = c("red", "black"), lwd = 2, bty = "n")
par(op)
## ----eval = FALSE-------------------------------------------------------------
# set.seed(123, "L'Ecuyer-CMRG")
## ----eval = FALSE-------------------------------------------------------------
# parallel::detectCores(logical = FALSE)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.