random_k_cross_validation <- function (k = 10, kn = round(nrow(dataset)/k), dataset, model_formula = NULL,
response, model_type)
{
# library(kernlab)
# library(earth)
# FUNCTION OVERVIEW
# NOTE: this is purely for linear models and prediction
# It does not support classification or other models yet
# This cross-validation creates k random subsets from the data
# The subsets represent the testing data
# The linear model is trained on the remaining data
# Then this model predicts the subset
# the predictions and observations are saved in a data frame
# this data frame is returned from the function
# FUNCTION INPUTS
# K represents the number of subsets / folds to be created
# kn represents the size of each subset / fold
# the dataset is the data being modelled
# the model_formula is the formula of the model
# the response is a numeric vector of the response variable
# model_type is a character input representing the model being used e.g. "lm", "earth", or "svm"
test_observations <- numeric(0)
test_predictions <- numeric(0)
test_error <- numeric(0)
model_observations <- numeric(0)
model_fitted_values <- numeric(0)
model_residuals <- numeric(0)
# PART 1: THE FOR LOOP AND DERIVING THE DATA
for (i in 1:k) {
# sample indexes from the dataset
ki <- sample(x = 1:nrow(dataset), size = kn)
# subset testing data according to the indexes
test_data <- dataset[ki, ]
# subset training data acording to the indexes
train_data <- dataset[-ki, ]
if (model_type == "lm") {
# train the given model using the training data
model <- lm(formula = model_formula, data = train_data)
} else if (model_type == "svm") {
# train the given model using the training data
model <- lm(formula = model_formula, data = train_data, scale = T, type = "eps-svr",
kernel = "rbfdot", fit = T)
} else if (model_type == "earth") {
# train the given model using the training data
model <- earth(formula = model_formula, data = train_data, pmethod = "forward")
}
# predict the test data and update the test data vector
test_predictions <- c(test_predictions, predict(model, test_data))
# save the test observations to the test observations vector
test_observations <- c(test_observations, response[ki])
# save the prediction error to the test erro vector
test_error <- c(test_error, predict(model, test_data) - response[ki])
# update the model observations
model_observations <- c(model_observations, response[-ki])
# update the model fitted values
model_fitted_values <- c(model_fitted_values, model$fitted.values)
# update model residuals
model_residuals <- c(model_residuals, model$residuals)
}
# PART 2: CREATING THE LIST
# This part creates the two data frames from the for loop data
# and outputs them to a list
# create a data frame to hold the predicitons and observations
obs_preds_df <- as.data.frame(cbind(test_observations, test_predictions, test_error))
# rename the columns of the data frame
colnames(obs_preds_df) <- c("Observations", "Predicitons", "Error")
# create a data frame to hold the fitted values and residuals of each model
fits_res_df <- as.data.frame(cbind(model_observations, model_fitted_values, model_residuals))
# rename the columns of the data frame
colnames(fits_res_df) <- c("Observations", "Fits", "Residuals")
# save the cross validation list
cv_list <- list(obs_preds_df, fits_res_df)
#return(cv_list)
# PART 3: SUMMARY DATA FRAME
# This part creates a summary data frame using the cross-validation list
# the summary data frame will have two rows
# the first row represents the metrics related to the test data i.e. observations, predictions and error
# the second row represents the metrics related to the model data i.e. observations, fitted values and residuals
summary_df <- as.data.frame(matrix(nrow = 2, ncol = 7))
# Add in the colnames
colnames(summary_df) <- c("SSE","MSE", "RMSE", "SAE", "MAE", "RMAE", "Rsq")
rownames(summary_df) <- c("Test_Data", "Model_Data")
# ROW 1: TEST DATA
# Sum Squared Error of Model predictions and observations
summary_df$SSE[1] <- sum((cv_list[[1]]$Error)^2)
# sum Absolute Error
summary_df$SAE[1] <- sum(abs(cv_list[[1]]$Error))
# Mean Absolute Error of model predictions and observations
summary_df$MAE[1] <- mean(abs(cv_list[[1]]$Error))
# Root Mean Absolute Error of model predictions and observations
summary_df$RMAE[1] <- sqrt(mean(abs(cv_list[[1]]$Error)))
# Mean Squared Error of model predictions and observations
summary_df$MSE[1] <- mean((cv_list[[1]]$Error)^2)
# Root Mean Squared Error of model predictions and observations
summary_df$RMSE[1] <- sqrt(mean((cv_list[[1]]$Error)^2))
# R Squared of model predictions and observations
summary_df$Rsq[1] <- 1 - sum((cv_list[[1]]$Error)^2) /
sum((cv_list[[1]]$Observations - mean(cv_list[[1]]$Predicitons))^2)
# ROW 2: MODEL DATA
# R squared value of model predictions and observations
# Sum Squared Error of Model predictions and observations
summary_df$SSE[2] <- sum((cv_list[[2]]$Residuals)^2)
# sum Absolute Error
summary_df$SAE[2] <- sum(abs(cv_list[[2]]$Residuals))
# Mean Absolute Error of model predictions and observations
summary_df$MAE[2] <- mean(abs(cv_list[[2]]$Residuals))
# Root Mean Absolute Error of model predictions and observations
summary_df$RMAE[2] <- sqrt(mean(abs(cv_list[[2]]$Residuals)))
# Mean Squared Error of model predictions and observations
summary_df$MSE[2] <- mean((cv_list[[2]]$Residuals)^2)
# Root Mean Squared Error of model predictions and observations
summary_df$RMSE[2] <- sqrt(mean((cv_list[[2]]$Residuals)^2))
# R Squared of model predictions and observations
summary_df$Rsq[2] <- 1 - sum((cv_list[[2]]$Residuals)^2) /
sum((cv_list[[2]]$Observations - mean(cv_list[[2]]$Fits))^2)
# Update the cross validation list with the summary data frame
cv_list <- list(obs_preds_df, fits_res_df, summary_df)
# output the cross validation list
return(cv_list)
}
cross_validation <- function(dataset, model_formula, response, k = 10){
# Default is 10-fold cross validation
# takes a datset and divides it into k subsets
# trains models on 9 subsets and tests on the remaining subset
# model_formula is given in order to construct models within the function
# response is the response variable as a numeric vector
ki <- floor((nrow(dataset)/k))
test_predictions <- numeric(0)
test_observations <- numeric(0)
k <- k + 1
for (i in 1:k) {
if (i == 1) {
# subset testing data
test_data <- dataset[1:ki, ]
# subset training data
train_data <- dataset[-(1:ki), ]
# fit model to training data
model <- lm(formula = model_formula, data = train_data)
# predict the test data
test_predictions_a <- predict(model, test_data)
# update test predictions vector
test_predictions <- c(test_predictions,test_predictions_a)
# save test observations
test_observations_a <- response[1:ki]
# update test observations vector
test_observations <- c(test_observations, test_observations_a)
} else if (i != 1 & i != k) {
# subset testing data
test_data <- dataset[seq(from = (ki*(i - 1) + 1), to = (ki*(i)), by = 1), ]
# subset training data
train_data <- dataset[-(seq(from = (ki*(i - 1) + 1), to = (ki*(i)), by = 1)), ]
# fit model to training data
model <- lm(formula = model_formula, data = train_data)
# predict the test data
test_predictions_b <- predict(model, test_data)
# update test predictions vector
test_predictions <- c(test_predictions, test_predictions_b)
# save test observations
test_observations_b <- response[(ki*(i - 1) + 1)]
# update test observations vector
test_observations <- c(test_observations, test_observations_b)
} else if (i == k) {
# subset testing data
test_data <- dataset[seq(from = (ki*(i - 1) + 1), to = nrow(dataset), by = 1), ]
# test_data <- dataset[(ki*(i - 1) + 1):nrow(dataset), ]
# subset training data
train_data <- dataset[-(seq(from = (ki*(i - 1) + 1), to = nrow(dataset), by = 1)), ]
# train_data <- dataset[-((ki*(i - 1) + 1):nrow(dataset)), ]
# fit model to training data
model <- lm(formula = model_formula, data = train_data)
# predict the test data
test_predictions_c <- predict(model, test_data)
# update test predictions vector
test_predictions <- c(test_predictions,test_predictions_c)
# save test observations
test_observations_c <- response[(ki*(i - 1) + 1)]
# update test observations vector
test_observations <- c(test_observations, test_observations_c)
}
}
test_df <- as.data.frame(cbind(test_observations, test_predictions))
colnames(test_df) <- c("Observations", "Predicitons")
# head(test_df)
return(test_df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.