# inst/doc/cross-validation_with_groupdata2.R

## ----include=FALSE------------------------------------------------------------

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.align = 'center',
  dpi = 92,
  fig.retina = 2,
  eval = requireNamespace("lmerTest")
)
options(tibble.print_min = 4L, tibble.print_max = 4L)


## ----warning=FALSE,message=FALSE----------------------------------------------
# Attach some packages
library(groupdata2)
library(dplyr)
library(knitr)    # kable()
library(lmerTest) # lmer()


# Create data frame
df <- data.frame(
  "participant" = factor(rep(1:10, 3)),
  "age" = rep(c(20, 23, 27, 21, 32, 31, 43, 21, 34, 32), 3),
  "diagnosis" = factor(rep(c('a', 'b', 'a', 'b', 'b',
                             'a', 'a', 'a', 'b', 'b'), 3)),
  "score" = c(10, 24, 15, 35, 24, 14, 11, 16, 33, 29,  # for 1st session
              24, 40, 30, 50, 54, 25, 35, 32, 53, 55,  # for 2nd session
              45, 67, 40, 78, 62, 30, 41, 44, 66, 81)) # for 3rd session

# Order by participant
df <- df %>% arrange(participant)

# Remove index
rownames(df) <- NULL

# Add session info
df$session <- rep(1:3, 10)

# Show the data frame
kable(df, align = 'c')


## -----------------------------------------------------------------------------

xpectr::set_test_seed(1) # For reproducibility

# Split the data into 20% (test) and 80% (train) partitions
partition(df, p = 0.2, id_col = "participant") %>% 
  .[1] %>% # Look at the first partition only (the 20% test set)
  kable()  # Pretty tables :) 


## -----------------------------------------------------------------------------

xpectr::set_test_seed(1) # For reproducibility

# Split the data into 20% (test) and 80% (train) partitions, balanced by diagnosis
parts <- partition(df, p = 0.2, id_col = "participant", cat_col = 'diagnosis')

test_set <- parts[[1]]
train_set <- parts[[2]]

# Show test_set
test_set %>% kable()


## -----------------------------------------------------------------------------
train_set %>% 
  count(diagnosis) %>% 
  kable(align = 'c')

## -----------------------------------------------------------------------------
train_set %>% 
  count(participant) %>% 
  kable(align = 'c')

## -----------------------------------------------------------------------------
xpectr::set_test_seed(1) # For reproducibility

train_set <- fold(train_set, k = 4, cat_col = 'diagnosis', id_col = 'participant')

# Order by .folds
train_set <- train_set %>% arrange(.folds)

train_set %>% kable()

## -----------------------------------------------------------------------------
train_set %>% 
  count(participant, diagnosis) %>% 
  kable(align = 'c')
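
## -----------------------------------------------------------------------------
# An extra sanity check (a sketch, not part of the original vignette): count the
# rows per fold and diagnosis to confirm that fold() balanced the diagnoses
# across the folds
train_set %>% 
  count(.folds, diagnosis) %>% 
  kable(align = 'c')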

## -----------------------------------------------------------------------------
# RMSE metric
rmse <- function(predictions, targets){
  sqrt(mean((predictions - targets)^2))
}
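
# A quick sanity check of the rmse() helper (not part of the original vignette):
# identical vectors give an RMSE of 0, and a constant error of 2 gives an RMSE of 2
rmse(c(12, 22, 32), c(12, 22, 32))
rmse(c(12, 22, 32), c(10, 20, 30))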

crossvalidate <- function(data, k, model, dependent, random = FALSE){
  # 'data' is the training set with the ".folds" column
  # 'k' is the number of folds we have
  # 'model' is a string describing a linear regression model formula
  # 'dependent' is a string with the name of the score column we want to predict
  # 'random' is a logical (TRUE/FALSE); do we have random effects in the model?
  
  # Initialize an empty vector for recording the fold-wise performances
  performances <- c()
  
  # One iteration per fold
  for (fold in 1:k){
    
    # Create training set for this iteration
    # Subset all the datapoints where .folds does not match the current fold
    training_set <- data[data$.folds != fold,]
    
    # Create test set for this iteration
    # Subset all the datapoints where .folds matches the current fold
    testing_set <- data[data$.folds == fold,]
    
    ## Train model

    # If the model has random effects, train it with lmer(),
    # otherwise train it with lm().
    # Convert the model string to a formula explicitly and store the fit in
    # 'fitted_model', so the 'model' argument is not overwritten between folds

    if (isTRUE(random)){

      # Train linear mixed effects model on the training set
      fitted_model <- lmer(as.formula(model), training_set, REML = FALSE)

    } else {

      # Train linear model on the training set
      fitted_model <- lm(as.formula(model), training_set)

    }

    ## Test model

    # Predict the dependent variable in the testing_set with the trained model
    predicted <- predict(fitted_model, testing_set, allow.new.levels = TRUE)

    # Get the Root Mean Square Error between the predicted and the observed values
    RMSE <- rmse(predicted, testing_set[[dependent]])

    # Add the RMSE to the performance vector
    performances[fold] <- RMSE


  }

  # Return the mean of the recorded RMSEs
  return(c('RMSE' = mean(performances)))

}


## ----eval=requireNamespace("broom")-------------------------------------------
lm(score ~ diagnosis, df) %>% 
  broom::tidy()

## -----------------------------------------------------------------------------
m0 <- 'score ~ 1 + (1 | participant)'
m1 <- 'score ~ diagnosis + (1 | participant)'
m2 <- 'score ~ diagnosis + age + (1 | participant)'
m3 <- 'score ~ diagnosis + session + (1 | participant)'
m4 <- 'score ~ diagnosis * session + (1 | participant)'
m5 <- 'score ~ diagnosis * session + age + (1 | participant)'


## ----message=FALSE------------------------------------------------------------
m0
crossvalidate(train_set, k = 4, model = m0, dependent = 'score', random = TRUE)

m1
crossvalidate(train_set, k = 4, model = m1, dependent = 'score', random = TRUE)

m2
crossvalidate(train_set, k = 4, model = m2, dependent = 'score', random = TRUE)

m3
crossvalidate(train_set, k = 4, model = m3, dependent = 'score', random = TRUE)

m4
crossvalidate(train_set, k = 4, model = m4, dependent = 'score', random = TRUE)

m5
crossvalidate(train_set, k = 4, model = m5, dependent = 'score', random = TRUE)
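
## -----------------------------------------------------------------------------
# A compact alternative (a sketch, not part of the original vignette): loop over
# the six model formulas with sapply() and collect the cross-validated RMSEs
model_formulas <- c("m0" = m0, "m1" = m1, "m2" = m2, "m3" = m3, "m4" = m4, "m5" = m5)
sapply(model_formulas, function(formula_string) {
  crossvalidate(train_set, k = 4, model = formula_string,
                dependent = 'score', random = TRUE)
})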


## -----------------------------------------------------------------------------
# Train the model on the full training set
model_m4 <- lmer(as.formula(m4), train_set, REML = FALSE)

# Predict the dependent variable in the test set with the trained model
predicted <- predict(model_m4, test_set, allow.new.levels = TRUE)

# Get the Root Mean Square Error between the predicted and the observed
RMSE <- rmse(predicted, test_set[['score']])
RMSE
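
## -----------------------------------------------------------------------------
# For a quick inspection (an extra sketch, not part of the original vignette):
# place the observed test set scores next to the model's predictions
tibble::tibble(
  participant = test_set$participant,
  session = test_set$session,
  observed = test_set$score,
  predicted = round(predicted, 1)
) %>% 
  kable(align = 'c')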

## -----------------------------------------------------------------------------
model_m4 %>%
  summary()
