knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
# Loading all the packages that are needed for this tutorial
library(MyStat302Package)
library(ggplot2)
library(dplyr)
library(class)
library(randomForest)

Brief Introduction

For this MyStat302Package, you can install it by

install.packages("MyStat302Package")
library(MyStat302Package)

or

devtools::install_github("Chaos-Gao/MyStat302Package")
library(MyStat302Package)

In this package, it includes six functions, which are my_pow, f_to_c, my_t.test, my_lm, my_knn_cv, and my_rf_cv. Below contains four tutorials for my_t.test, my_lm, my_knn_cv, and my_rf_cv respectively.

my_t.test Tutorial

# A two tail t-test, with 60 as null hypothesis mean value, and alternative 
# hypothesis is mu not equal to 60
lifeExp_data <- my_gapminder$lifeExp
my_t.test(lifeExp_data, mu = 60)
# A left tail t-test, with 60 as null hypothesis mean value, and alternative 
# hypothesis is mu smaller than 60
my_t.test(lifeExp_data, alternative = "less", mu = 60)
# A right tail t-test, with 60 as null hypothesis mean value, and alternative 
# hypothesis is mu greater than 60
my_t.test(lifeExp_data, alternative = "greater", mu = 60)

my_lm Tutorial

# Operate the linear regression
linear_pred <- my_lm(lifeExp ~ continent + gdpPercap, data = my_gapminder)
linear_pred
# Extract the coefficients
my_estimates <- linear_pred[, 1]
# Extract X variable
my_X <- model.matrix(lifeExp ~ continent + gdpPercap, my_gapminder)
# Calculate the fitted data
lifeExp_fit <- my_X %*% my_estimates

# Drew the Actual vs. Fitted plot
my_df <- data.frame(actual = my_gapminder$lifeExp, fitted = lifeExp_fit, 
                    continent = my_gapminder$continent)
ggplot(my_df, aes(x = fitted, y = actual, color = continent)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, col = "red", lty = 2) + 
  theme_bw(base_size = 15) +
  labs(x = "Fitted values", y = "Actual values", title = "Actual vs. Fitted") +
  theme(plot.title = element_text(hjust = 0.5))

my_knn_cv Tutorial

# Remove NA in Data
penguins_data <- na.omit(my_penguins)
# Get the train and cl parameter from the data
train <- penguins_data[, 3:6]
cl <- penguins_data$species

# The list record the prediction
pred_class_list <- list()
# The vector record the test error
cv_err_vec <- vector()
# Do the knn prediction from 1 nearest neighbor to 10 nearest neighbors
for (j in 1:10) {
  temp <- my_knn_cv(train, cl, j, 5)
  pred_class_list[[j]] <- temp[[1]]
  cv_err_vec[j] <- temp[[2]]
}

# Calculate the training error
train_err_vec <- vector()
for (i in 1:10) {
  train_err_vec[i] <- sum(pred_class_list[[i]] != penguins_data$species) / 
    length(pred_class_list[[i]])
}

# Construct a table showing both the training and testing error
compare_err <- cbind.data.frame(cv_err_vec, train_err_vec)
compare_err

my_rf_cv Tutorial

# Do the CV test with k = 2, 5, 8, and for each k, run 30 times
fold_vec <- c(2, 5, 8)
cv_err_mat <- matrix(nrow = 30, ncol = length(fold_vec))
for (i in 1:length(fold_vec)) {
  for (j in 1:30) {
    cv_err_mat[j, i] <- my_rf_cv(fold_vec[i])
  }
}
# Drew the three boxplots
cv_err_data <- as.data.frame(cv_err_mat)
colnames(cv_err_data) <- c("Fold2", "Fold5", "Fold8")
ggplot(data = cv_err_data) + 
  geom_boxplot(aes(x = "k = 2", y = Fold2)) +
  geom_boxplot(aes(x = "k = 5", y = Fold5)) +
  geom_boxplot(aes(x = "k = 8", y = Fold8)) +
  theme_bw(base_size = 10) +
  labs(title = "The MSE Distribution for k-folds Cross Validation",
       x = "Number of Folds",
       y = "CV Estimated MSE ") +
  theme(plot.title =
        element_text(hjust = 0.5))
# Calculate the mean and sd of the MSE
mean_err_vec <- colMeans(cv_err_mat)
sd_err_vec <- vector()
for (i in 1:3) {
  sd_err_vec[i] <- sd(cv_err_mat[, i])
}

# Show the mean and sd in a table
cv_mean_sd <- cbind.data.frame(mean_err_vec, sd_err_vec)
cv_mean_sd


Chaos-Gao/MyStat302Package documentation built on March 20, 2021, 2 p.m.