R/cross_validate.R

random_k_cross_validation <- function (k = 10, kn = round(nrow(dataset)/k), dataset, model_formula = NULL, 
                                       response, model_type) 
{
  # library(kernlab)
  # library(earth)
  # FUNCTION OVERVIEW
  # NOTE: this is purely for linear models and prediction
  # It does not support classification or other models yet 
  # This cross-validation creates k random subsets from the data
  # The subsets represent the testing data
  # The linear model is trained on the remaining data
  # Then this model predicts the subset
  # the predictions and observations are saved in a data frame
  # this data frame is returned from the function
  # FUNCTION INPUTS
  # K represents the number of subsets / folds to be created
  # kn represents the size of each subset / fold
  # the dataset is the data being modelled
  # the model_formula is the formula of the model
  # the response is a numeric vector of the response variable
  # model_type is a character input representing the model being used e.g. "lm", "earth", or "svm"
  test_observations <- numeric(0)
  test_predictions <- numeric(0)
  test_error <- numeric(0)
  model_observations <- numeric(0)
  model_fitted_values <- numeric(0)
  model_residuals <- numeric(0)
  
  # PART 1: THE FOR LOOP AND DERIVING THE DATA
  
  for (i in 1:k) {
    # sample indexes from the dataset
    ki <- sample(x = 1:nrow(dataset), size = kn)
    # subset testing data according to the indexes
    test_data <- dataset[ki, ]
    # subset training data acording to the indexes
    train_data <- dataset[-ki, ]
    if (model_type == "lm") {
      # train the given model using the training data
      model <- lm(formula = model_formula, data = train_data)
    } else if (model_type == "svm") {
      # train the given model using the training data
      model <- lm(formula = model_formula, data = train_data, scale = T, type = "eps-svr",
                  kernel = "rbfdot", fit = T)
    } else if (model_type == "earth") {
      # train the given model using the training data
      model <- earth(formula = model_formula, data = train_data, pmethod = "forward")
    }
    # predict the test data and update the test data vector
    test_predictions <- c(test_predictions, predict(model, test_data))
    # save the test observations to the test observations vector
    test_observations <- c(test_observations, response[ki])
    # save the prediction error to the test erro vector
    test_error <- c(test_error, predict(model, test_data) - response[ki])
    # update the model observations
    model_observations <- c(model_observations, response[-ki])
    # update the model fitted values
    model_fitted_values <- c(model_fitted_values, model$fitted.values)
    # update model residuals
    model_residuals <- c(model_residuals, model$residuals)
  }
  
  # PART 2: CREATING THE LIST
  
  # This part creates the two data frames from the for loop data
  # and outputs them to a list
  # create a data frame to hold the predicitons and observations
  obs_preds_df <- as.data.frame(cbind(test_observations, test_predictions, test_error))
  # rename the columns of the data frame
  colnames(obs_preds_df) <- c("Observations", "Predicitons", "Error")
  # create a data frame to hold the fitted values and residuals of each model
  fits_res_df <- as.data.frame(cbind(model_observations, model_fitted_values, model_residuals))
  # rename the columns of the data frame
  colnames(fits_res_df) <- c("Observations", "Fits", "Residuals")
  # save the cross validation list
  cv_list <- list(obs_preds_df, fits_res_df)
  #return(cv_list)
  
  # PART 3: SUMMARY DATA FRAME
  
  # This part creates a summary data frame using the cross-validation list
  # the summary data frame will have two rows
  # the first row represents the metrics related to the test data i.e. observations, predictions and error
  # the second row represents the metrics related to the model data i.e. observations, fitted values and residuals
  summary_df <- as.data.frame(matrix(nrow = 2, ncol = 7))
  # Add in the colnames
  colnames(summary_df) <- c("SSE","MSE", "RMSE", "SAE", "MAE", "RMAE", "Rsq")
  rownames(summary_df) <- c("Test_Data", "Model_Data")
  
  # ROW 1: TEST DATA  
  # Sum Squared Error of Model predictions and observations
  summary_df$SSE[1] <- sum((cv_list[[1]]$Error)^2)
  # sum Absolute Error
  summary_df$SAE[1] <- sum(abs(cv_list[[1]]$Error))
  # Mean Absolute Error of model predictions and observations
  summary_df$MAE[1] <- mean(abs(cv_list[[1]]$Error))
  # Root Mean Absolute Error of model predictions and observations
  summary_df$RMAE[1] <- sqrt(mean(abs(cv_list[[1]]$Error)))
  # Mean Squared Error of model predictions and observations
  summary_df$MSE[1] <- mean((cv_list[[1]]$Error)^2)
  # Root Mean Squared Error of model predictions and observations
  summary_df$RMSE[1] <- sqrt(mean((cv_list[[1]]$Error)^2))
  # R Squared of model predictions and observations
  summary_df$Rsq[1] <-  1 - sum((cv_list[[1]]$Error)^2) / 
    sum((cv_list[[1]]$Observations - mean(cv_list[[1]]$Predicitons))^2)
  
  # ROW 2: MODEL DATA
  # R squared value of model predictions and observations
  # Sum Squared Error of Model predictions and observations
  summary_df$SSE[2] <- sum((cv_list[[2]]$Residuals)^2)
  # sum Absolute Error
  summary_df$SAE[2] <- sum(abs(cv_list[[2]]$Residuals))
  # Mean Absolute Error of model predictions and observations
  summary_df$MAE[2] <- mean(abs(cv_list[[2]]$Residuals))
  # Root Mean Absolute Error of model predictions and observations
  summary_df$RMAE[2] <- sqrt(mean(abs(cv_list[[2]]$Residuals)))
  # Mean Squared Error of model predictions and observations
  summary_df$MSE[2] <- mean((cv_list[[2]]$Residuals)^2)
  # Root Mean Squared Error of model predictions and observations
  summary_df$RMSE[2] <- sqrt(mean((cv_list[[2]]$Residuals)^2))
  # R Squared of model predictions and observations
  summary_df$Rsq[2] <-  1 - sum((cv_list[[2]]$Residuals)^2) / 
    sum((cv_list[[2]]$Observations - mean(cv_list[[2]]$Fits))^2)
  
  # Update the cross validation list with the summary data frame
  cv_list <- list(obs_preds_df, fits_res_df, summary_df)
  # output the cross validation list
  return(cv_list)
}




cross_validation <- function(dataset, model_formula, response, k = 10){
  # Default is 10-fold cross validation
  # takes a datset and divides it into k subsets
  # trains models on 9 subsets and tests on the remaining subset
  # model_formula is given in order to construct models within the function
  # response is the response variable as a numeric vector
  ki <- floor((nrow(dataset)/k))
  test_predictions <- numeric(0)
  test_observations <- numeric(0)
  k <- k + 1
  for (i in 1:k) {
    if (i == 1) {
      
      # subset testing data
      test_data <- dataset[1:ki, ]
      # subset training data
      train_data <- dataset[-(1:ki), ]
      # fit model to training data
      model <- lm(formula = model_formula, data = train_data)
      # predict the test data
      test_predictions_a <- predict(model, test_data)
      # update test predictions vector
      test_predictions <- c(test_predictions,test_predictions_a)
      # save test observations
      test_observations_a  <- response[1:ki]
      # update test observations vector
      test_observations <- c(test_observations, test_observations_a)
    } else if (i != 1 & i != k) {
      
      # subset testing data
      test_data <- dataset[seq(from = (ki*(i - 1) + 1), to = (ki*(i)), by = 1), ]
      # subset training data
      train_data <- dataset[-(seq(from = (ki*(i - 1) + 1), to = (ki*(i)), by = 1)), ]
      # fit model to training data
      model <- lm(formula = model_formula, data = train_data)
      # predict the test data
      test_predictions_b <- predict(model, test_data)
      # update test predictions vector
      test_predictions <- c(test_predictions, test_predictions_b)
      # save test observations
      test_observations_b  <- response[(ki*(i - 1) + 1)]
      # update test observations vector
      test_observations <- c(test_observations, test_observations_b)
      
    } else if (i == k) {
      
      # subset testing data
      test_data <- dataset[seq(from = (ki*(i - 1) + 1), to = nrow(dataset), by = 1), ]
      # test_data <- dataset[(ki*(i - 1) + 1):nrow(dataset), ]
      # subset training data
      train_data <- dataset[-(seq(from = (ki*(i - 1) + 1), to = nrow(dataset), by = 1)), ]
      # train_data <- dataset[-((ki*(i - 1) + 1):nrow(dataset)), ]
      # fit model to training data
      model <- lm(formula = model_formula, data = train_data)
      # predict the test data
      test_predictions_c <- predict(model, test_data)
      # update test predictions vector
      test_predictions <- c(test_predictions,test_predictions_c)
      # save test observations
      test_observations_c  <- response[(ki*(i - 1) + 1)]
      # update test observations vector
      test_observations <- c(test_observations, test_observations_c)
      
    }
  }
  test_df <- as.data.frame(cbind(test_observations, test_predictions))
  colnames(test_df) <- c("Observations", "Predicitons")
  # head(test_df) 
  return(test_df)
}
oislen/BuenaVista documentation built on May 16, 2019, 8:12 p.m.