inst/doc/nestedcv.R

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  warning = FALSE
)
library(nestedcv)
library(pROC)

## ----eval = FALSE-------------------------------------------------------------
#  install.packages("nestedcv")
#  library(nestedcv)

## -----------------------------------------------------------------------------
## Example binary classification problem with P >> n
x <- matrix(rnorm(150 * 2e+04), 150, 2e+04)  # predictors
y <- factor(rbinom(150, 1, 0.5))  # binary response

## Partition data into 2/3 training set, 1/3 test set
trainSet <- caret::createDataPartition(y, p = 0.66, list = FALSE)

## t-test filter using whole test set
filt <- ttest_filter(y, x, nfilter = 100)
filx <- x[, filt]

## Train glmnet on training set only using filtered predictor matrix
library(glmnet)
fit <- cv.glmnet(filx[trainSet, ], y[trainSet], family = "binomial")

## Predict response on test set
predy <- predict(fit, newx = filx[-trainSet, ], s = "lambda.min", type = "class")
predy <- as.vector(predy)
predyp <- predict(fit, newx = filx[-trainSet, ], s = "lambda.min", type = "response")
predyp <- as.vector(predyp)
output <- data.frame(testy = y[-trainSet], predy = predy, predyp = predyp)

## Results on test set
## shows bias since univariate filtering was applied to whole dataset
predSummary(output)

## Nested CV
fit2 <- nestcv.glmnet(y, x, family = "binomial", alphaSet = 7:10 / 10,
                      filterFUN = ttest_filter,
                      filter_options = list(nfilter = 100))
fit2

testroc <- pROC::roc(output$testy, output$predyp, direction = "<", quiet = TRUE)
inroc <- innercv_roc(fit2)
plot(fit2$roc)
lines(inroc, col = 'blue')
lines(testroc, col = 'red')
legend('bottomright', legend = c("Nested CV", "Left-out inner CV folds", 
                                 "Test partition, non-nested filtering"), 
       col = c("black", "blue", "red"), lty = 1, lwd = 2, bty = "n")

## ----out.width='75%', fig.align="center", echo=FALSE--------------------------
knitr::include_graphics("fig1.svg")

## ----out.width='75%', fig.align="center", echo=FALSE--------------------------
knitr::include_graphics("fig2.svg")

## ----eval = FALSE-------------------------------------------------------------
#  # Raw RNA-Seq data for this example is located at:
#  # https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-11611/
#  
#  # set up data
#  load("/../R4RA_270821.RData")
#  
#  index <- r4ra.meta$Outliers_Detected_On_PCA != "outlier" & r4ra.meta$Visit == 3 &
#            !is.na(r4ra.meta$Visit)
#  metadata <- r4ra.meta[index, ]
#  dim(metadata)  # 133 individuals
#  
#  medians <- Rfast::rowMedians(as.matrix(r4ra.vst))
#  data <- t(as.matrix(r4ra.vst))
#  # remove low expressed genes
#  data <- data[index, medians > 6]
#  dim(data)  # 16254 genes
#  
#  # Rituximab cohort only
#  yrtx <- metadata$CDAI.response.status.V7[metadata$Randomised.medication == "Rituximab"]
#  yrtx <- factor(yrtx)
#  data.rtx <- data[metadata$Randomised.medication == "Rituximab", ]
#  
#  # no filter
#  res.rtx <- nestcv.glmnet(y = yrtx, x = data.rtx,
#                           family = "binomial", cv.cores = 8,
#                           alphaSet = seq(0.7, 1, 0.05))
#  res.rtx

## ----eval = FALSE-------------------------------------------------------------
#  # t-test filter
#  res.rtx <- nestcv.glmnet(y = yrtx, x = data.rtx, filterFUN = ttest_filter,
#                           filter_options = list(nfilter = 300, p_cutoff = NULL),
#                           family = "binomial", cv.cores = 8,
#                           alphaSet = seq(0.7, 1, 0.05))
#  summary(res.rtx)

## ----eval = FALSE-------------------------------------------------------------
#  plot_alphas(res.rtx)
#  plot_lambdas(res.rtx)

## ----out.width='100%', fig.align="center", echo=FALSE-------------------------
knitr::include_graphics("plot_alpha_lam.png")

## ----eval = FALSE-------------------------------------------------------------
#  # Fold 1 line plot
#  plot(res.rtx$outer_result[[1]]$cvafit)
#  
#  # Scatter plot
#  plot(res.rtx$outer_result[[1]]$cvafit, type = 'p')
#  
#  # Number of non-zero coefficients
#  plot(res.rtx$outer_result[[1]]$cvafit, xaxis = 'nvar')

## ----out.width='100%', fig.align="center", echo=FALSE-------------------------
knitr::include_graphics("plot_cva.png")

## ----eval = FALSE-------------------------------------------------------------
#  # Outer CV ROC
#  plot(res.rtx$roc, main = "Outer fold ROC", font.main = 1, col = 'blue')
#  legend("bottomright", legend = paste0("AUC = ", signif(pROC::auc(res.rtx$roc), 3)), bty = 'n')
#  
#  # Inner CV ROC
#  rtx.inroc <- innercv_roc(res.rtx)
#  plot(rtx.inroc, main = "Inner fold ROC", font.main = 1, col = 'red')
#  legend("bottomright", legend = paste0("AUC = ", signif(pROC::auc(rtx.inroc), 3)), bty = 'n')

## ----out.width='100%', fig.align="center", echo=FALSE-------------------------
knitr::include_graphics("roc.png")

## ----eval = FALSE-------------------------------------------------------------
#  boxplot_model(res.rtx, ylab = "VST")

## ----out.width='70%', fig.align="center", echo=FALSE--------------------------
knitr::include_graphics("boxplot.png")

## ----eval = FALSE-------------------------------------------------------------
#  # Outer LOOCV
#  res.rtx <- nestcv.glmnet(y = yrtx, x = data.rtx, min_1se = 0, filterFUN = ttest_filter,
#                           filter_options = list(nfilter = 300, p_cutoff = NULL),
#                           outer_method = "LOOCV",
#                           family = "binomial", cv.cores = 8,
#                           alphaSet = seq(0.7, 1, 0.05))
#  summary(res.rtx)

## ----eval = FALSE-------------------------------------------------------------
#  # Random forest filter
#  res.rtx <- nestcv.glmnet(y = yrtx, x = data.rtx, min_1se = 0.5, filterFUN = rf_filter,
#                           filter_options = list(nfilter = 300),
#                           family = "binomial", cv.cores = 8,
#                           alphaSet = seq(0.7, 1, 0.05))
#  summary(res.rtx)
#  
#  # ReliefF algorithm filter
#  res.rtx <- nestcv.glmnet(y = yrtx, x = data.rtx, min_1se = 0, filterFUN = relieff_filter,
#                           filter_options = list(nfilter = 300),
#                           family = "binomial", cv.cores = 8,
#                           alphaSet = seq(0.7, 1, 0.05))
#  summary(res.rtx)

## -----------------------------------------------------------------------------
library(mlbench)
data(BostonHousing2)
dat <- BostonHousing2
y <- dat$cmedv  ## continuous outcome
x <- subset(dat, select = -c(cmedv, medv, town))

stat_filter(y, x, type = "full")

## ----eval = FALSE-------------------------------------------------------------
#  filter <- function(y, x, ...) {}

## -----------------------------------------------------------------------------
## Imbalanced dataset
set.seed(1, "L'Ecuyer-CMRG")
x <- matrix(rnorm(150 * 2e+04), 150, 2e+04)  # predictors
y <- factor(rbinom(150, 1, 0.2))  # imbalanced binary response
table(y)

## first 30 parameters are weak predictors
x[, 1:30] <- rnorm(150 * 30, 0, 1) + as.numeric(y)*0.7

## -----------------------------------------------------------------------------
out <- randomsample(y, x)
y2 <- out$y
x2 <- out$x
table(y2)

## Nested CV glmnet with unnested balancing by random oversampling on
## whole dataset
fit1 <- nestcv.glmnet(y2, x2, family = "binomial", alphaSet = 1,
                      n_outer_folds = 4,
                      cv.cores=2,
                      filterFUN = ttest_filter)
fit1$summary

## -----------------------------------------------------------------------------
out <- randomsample(y, x, minor=1, major=0.4)
y2 <- out$y
x2 <- out$x
table(y2)

## Nested CV glmnet with unnested balancing by random undersampling on
## whole dataset
fit1b <- nestcv.glmnet(y2, x2, family = "binomial", alphaSet = 1,
                       n_outer_folds = 4,
                       cv.cores=2,
                       filterFUN = ttest_filter)
fit1b$summary

## Balance x & y outside of CV loop by SMOTE
out <- smote(y, x)
y2 <- out$y
x2 <- out$x
table(y2)

## Nested CV glmnet with unnested balancing by SMOTE on whole dataset
fit2 <- nestcv.glmnet(y2, x2, family = "binomial", alphaSet = 1,
                      n_outer_folds = 4,
                      cv.cores=2,
                      filterFUN = ttest_filter)
fit2$summary

## -----------------------------------------------------------------------------
## Nested CV glmnet with nested balancing by random oversampling
fit3 <- nestcv.glmnet(y, x, family = "binomial", alphaSet = 1,
                      n_outer_folds = 4,
                      cv.cores=2,
                      balance = "randomsample",
                      filterFUN = ttest_filter)
fit3$summary

## -----------------------------------------------------------------------------
## Nested CV glmnet with weights
w <- weight(y)
table(w)

fit4 <- nestcv.glmnet(y, x, family = "binomial", alphaSet = 1,
                      n_outer_folds = 4,
                      cv.cores=2,
                      weights = w,
                      filterFUN = ttest_filter)
fit4$summary

## -----------------------------------------------------------------------------
plot(fit1$roc, col='green')
lines(fit1b$roc, col='red')
lines(fit2$roc, col='blue')
lines(fit3$roc)
lines(fit4$roc, col='purple')
legend('bottomright', legend = c("Unnested random oversampling", 
                                 "Unnested SMOTE",
                                 "Unnested random undersampling",
                                 "Nested random oversampling",
                                 "Nested glmnet with weights"), 
       col = c("green", "blue", "red", "black", "purple"), lty = 1, lwd = 2, bty = "n", cex=0.8)

## ----eval = FALSE-------------------------------------------------------------
#  balance <- function(y, x, ...) {
#  
#    return(list(y = y, x = x))
#  }

## ----eval = FALSE-------------------------------------------------------------
#  # nested CV using caret
#  tg <- expand.grid(lambda = exp(seq(log(2e-3), log(1e0), length.out = 100)),
#                    alpha = seq(0.8, 1, 0.1))
#  ncv <- nestcv.train(y = yrtx, x = data.rtx,
#                 method = "glmnet",
#                 savePredictions = "final",
#                 filterFUN = ttest_filter, filter_options = list(nfilter = 300),
#                 tuneGrid = tg, cv.cores = 8)
#  ncv$summary
#  
#  # Plot ROC on outer folds
#  plot(ncv$roc)
#  
#  # Plot ROC on inner LO folds
#  inroc <- innercv_roc(ncv)
#  plot(inroc)
#  pROC::auc(inroc)
#  
#  # Extract coefficients of final fitted model
#  glmnet_coefs(ncv$final_fit$finalModel, s = ncv$finalTune$lambda)

## ----eval = FALSE-------------------------------------------------------------
#  # Example tuning plot for outer fold 1
#  plot(ncv$outer_result[[1]]$fit, xTrans = log)
#  
#  # ggplot2 version
#  ggplot(ncv$outer_result[[1]]$fit) +
#    scale_x_log10()

## ----eval = FALSE-------------------------------------------------------------
#  data(iris)
#  y <- iris$Species
#  x <- iris[, -5]
#  
#  out_folds <- caret::createFolds(y, k = 8)
#  in_folds <- lapply(out_folds, function(i) {
#    train_y <- y[-i]
#    caret::createFolds(train_y, k = 8)
#  })
#  
#  res <- nestcv.train(y, x, method = "rf",
#                      cv.cores = 8,
#                      inner_folds = in_folds,
#                      outer_folds = out_folds)
#  summary(res)
#  res$outer_folds  # show which outer fold indices were used

## ----eval = FALSE-------------------------------------------------------------
#  # for nestcv.glmnet object
#  preds <- predict(res.rtx, newdata = data.rtx, type = 'response')
#  
#  # for nestcv.train object
#  preds <- predict(ncv, newdata = data.rtx)

## ----eval = FALSE-------------------------------------------------------------
#  parallel::detectCores(logical = FALSE)

## ----eval = FALSE-------------------------------------------------------------
#  set.seed(123, "L'Ecuyer-CMRG")

Try the nestedcv package in your browser

Any scripts or data that you put into this service are public.

nestedcv documentation built on Oct. 26, 2023, 5:08 p.m.