knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
# Loading all the packages that are needed for this tutorial library(MyStat302Package) library(ggplot2) library(dplyr) library(class) library(randomForest)
For this MyStat302Package, you can install it by
install.packages("MyStat302Package") library(MyStat302Package)
or
devtools::install_github("Chaos-Gao/MyStat302Package") library(MyStat302Package)
In this package, it includes six functions, which are my_pow, f_to_c, my_t.test, my_lm, my_knn_cv, and my_rf_cv. Below contains four tutorials for my_t.test, my_lm, my_knn_cv, and my_rf_cv respectively.
# A two tail t-test, with 60 as null hypothesis mean value, and alternative # hypothesis is mu not equal to 60 lifeExp_data <- my_gapminder$lifeExp my_t.test(lifeExp_data, mu = 60)
# A left tail t-test, with 60 as null hypothesis mean value, and alternative # hypothesis is mu smaller than 60 my_t.test(lifeExp_data, alternative = "less", mu = 60)
# A right tail t-test, with 60 as null hypothesis mean value, and alternative # hypothesis is mu greater than 60 my_t.test(lifeExp_data, alternative = "greater", mu = 60)
# Operate the linear regression linear_pred <- my_lm(lifeExp ~ continent + gdpPercap, data = my_gapminder) linear_pred
# Extract the coefficients my_estimates <- linear_pred[, 1] # Extract X variable my_X <- model.matrix(lifeExp ~ continent + gdpPercap, my_gapminder) # Calculate the fitted data lifeExp_fit <- my_X %*% my_estimates # Drew the Actual vs. Fitted plot my_df <- data.frame(actual = my_gapminder$lifeExp, fitted = lifeExp_fit, continent = my_gapminder$continent) ggplot(my_df, aes(x = fitted, y = actual, color = continent)) + geom_point() + geom_abline(slope = 1, intercept = 0, col = "red", lty = 2) + theme_bw(base_size = 15) + labs(x = "Fitted values", y = "Actual values", title = "Actual vs. Fitted") + theme(plot.title = element_text(hjust = 0.5))
# Remove NA in Data penguins_data <- na.omit(my_penguins) # Get the train and cl parameter from the data train <- penguins_data[, 3:6] cl <- penguins_data$species # The list record the prediction pred_class_list <- list() # The vector record the test error cv_err_vec <- vector() # Do the knn prediction from 1 nearest neighbor to 10 nearest neighbors for (j in 1:10) { temp <- my_knn_cv(train, cl, j, 5) pred_class_list[[j]] <- temp[[1]] cv_err_vec[j] <- temp[[2]] } # Calculate the training error train_err_vec <- vector() for (i in 1:10) { train_err_vec[i] <- sum(pred_class_list[[i]] != penguins_data$species) / length(pred_class_list[[i]]) } # Construct a table showing both the training and testing error compare_err <- cbind.data.frame(cv_err_vec, train_err_vec) compare_err
# Do the CV test with k = 2, 5, 8, and for each k, run 30 times fold_vec <- c(2, 5, 8) cv_err_mat <- matrix(nrow = 30, ncol = length(fold_vec)) for (i in 1:length(fold_vec)) { for (j in 1:30) { cv_err_mat[j, i] <- my_rf_cv(fold_vec[i]) } }
# Drew the three boxplots cv_err_data <- as.data.frame(cv_err_mat) colnames(cv_err_data) <- c("Fold2", "Fold5", "Fold8") ggplot(data = cv_err_data) + geom_boxplot(aes(x = "k = 2", y = Fold2)) + geom_boxplot(aes(x = "k = 5", y = Fold5)) + geom_boxplot(aes(x = "k = 8", y = Fold8)) + theme_bw(base_size = 10) + labs(title = "The MSE Distribution for k-folds Cross Validation", x = "Number of Folds", y = "CV Estimated MSE ") + theme(plot.title = element_text(hjust = 0.5))
# Calculate the mean and sd of the MSE mean_err_vec <- colMeans(cv_err_mat) sd_err_vec <- vector() for (i in 1:3) { sd_err_vec[i] <- sd(cv_err_mat[, i]) } # Show the mean and sd in a table cv_mean_sd <- cbind.data.frame(mean_err_vec, sd_err_vec) cv_mean_sd
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.