# Vignette chunk defaults: collapse source + output, prefix output with "#>".
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
# Development-time dependency declarations. usethis::use_package() edits the
# package DESCRIPTION file, so it must NOT run on every vignette render --
# kept here commented for reference only.
# usethis::use_package("CodingProject3")
# usethis::use_package("ggplot2")

library(ElemStatLearn)

# Load each benchmark data set from ElemStatLearn into a named list.
data.name.vec <- c("spam", "zip.train", "prostate", "ozone")
data.list <- list()
for (data.name in data.name.vec) {
  data(list = data.name, package = "ElemStatLearn")
  data.list[[data.name]] <- get(data.name)
}

# Restrict zip.train to the binary 0-vs-1 digit sub-problem.
is.binary <- ElemStatLearn::zip.train[, 1] %in% c(0, 1)

# Re-build data.list with an explicit label vector and numeric feature matrix
# per data set (binary classification: spam, SAheart, zip.train;
# regression: prostate, ozone).
data.list <- list(
  spam = list(
    label.vec = ifelse(ElemStatLearn::spam$spam == "spam", 1, 0),
    feature.mat = as.matrix(ElemStatLearn::spam[, 1:57])),
  SAheart = list(
    # BUG FIX: the label column is SAheart$chd (already coded 0/1);
    # the original referenced the nonexistent SAheart$SAheart.
    label.vec = ElemStatLearn::SAheart$chd,
    # NOTE(review): SAheart contains the factor column famhist, so as.matrix()
    # coerces the whole matrix to character -- confirm downstream code
    # expects/handles this, or drop/recode famhist first.
    feature.mat = as.matrix(
      ElemStatLearn::SAheart[, -ncol(ElemStatLearn::SAheart)])),
  zip.train = list(
    label.vec = ElemStatLearn::zip.train[is.binary, 1],
    feature.mat = ElemStatLearn::zip.train[is.binary, -1]),
  prostate = list(
    label.vec = as.numeric(ElemStatLearn::prostate$lpsa),
    # Drop column 9 (lpsa, the label) and column 10 (train indicator).
    feature.mat = as.matrix(ElemStatLearn::prostate[, -c(9, 10)])),
  ozone = list(
    label.vec = ElemStatLearn::ozone$ozone,
    feature.mat = as.matrix(ElemStatLearn::ozone[, -1]))
)

n.folds <- 4
resultsList <- list()
mean.loss.list <- list()

for (data.name in names(data.list)) {
  one.data.set <- data.list[[data.name]]
  set.seed(1)  # reproducible fold assignment per data set

  # Only the ozone regression experiment is currently run; the classification
  # experiments remain scaffolded in comments below.
  if (data.name == "ozone") {
    X.mat <- one.data.set$feature.mat
    y.vec <- one.data.set$label.vec
    fold.vec <- sample(rep(1:n.folds, l = nrow(X.mat)))
    max.ite <- 50
    result.mat.list <- list()

    for (test.fold in seq_len(n.folds)) {
      is.test <- fold.vec == test.fold
      is.train <- !is.test
      # drop = FALSE keeps single-row selections as matrices.
      X.train <- X.mat[is.train, , drop = FALSE]
      X.valid <- X.mat[is.test, , drop = FALSE]
      y.train <- y.vec[is.train]
      y.valid <- y.vec[is.test]

      # Baseline predictor that ignores the features, to show the model learns
      # something non-trivial. Regression: mean of the training labels.
      baseline <- mean(y.train)

      # Internal CV folds for early stopping, drawn on the training rows only.
      fold.v <- sample(rep(1:n.folds, l = nrow(X.train)))
      resultES <- CodingProject3::NNetEarlyStoppingCV(
        X.train, y.train, fold.v,
        max.iterations = max.ite,
        step.size = 0.05,
        n.hidden.units = 30)
    }
    # TODO(review): per-fold test predictions were sketched but never stored:
    # pred.prob.list[[test.fold]] <- list(
    #   earlyStopping = resultES$predict(X.valid),
    #   baseline = rep(baseline, sum(is.test)))
    #
    # For each data set, compute a 4 x 3 matrix of mean test loss values:
    # one row per test fold; columns = early stopping predictor,
    # L2 regularized predictor, baseline/un-informed predictor.
  }
}

# Scaffolding for the binary classification data sets (spam, SAheart,
# zip.train). Baseline for classification: the most frequent training label.
# resultES <- LMLogisticLossEarlyStoppingCV(X.mat, y.vec, fold.vec,
#                                           max.iterations)
# result   <- LMLogisticLossIterations(X.mat, y.vec, fold.vec, max.iterations)
#
# Planned result order in resultsList:
# (LMLogisticLossEarlyStoppingCV, LMLogisticLossIterations) for spam,
# SAheart, zip.train; then (LMSquareLossEarlyStoppingCV,
# LMSquareLossIterations) for prostate and ozone.
# ggplot(resultsList[5]$train.loss.vec) # ggplot(resultsList[5]$validation.loss.vec) # ggplot(resultsList[6]$train.loss.vec) # ggplot(resultsList[6]$validation.loss.vec)
#print out and/or plot the matrix of loss values # pander::pandoc.table(resultsList[1]$train.loss.mat) # pander::pandoc.table(resultsList[2]$train.loss.mat)
Comment on the difference in test loss between the neural network and the baseline predictor.
# Plot the two loss curves from the early-stopping fit. BUG FIX: the collapsed
# source placed these calls after a "#" on the same line, so they were
# commented out and never executed.
plot(resultES$mean.train.loss.vec)
plot(resultES$mean.validation.loss)
# ggplot alternatives for the stored results, kept for reference:
# ggplot(resultsList[2]$train.loss.vec)
# ggplot(resultsList[2]$validation.loss.vec)
What are the optimal regularization parameters?
# selected
#print out and/or plot the matrix. # pander::pandoc.table(resultsList[3]$train.loss.mat) # pander::pandoc.table(resultsList[4]$train.loss.mat)
Comment on the difference in test loss between the neural network and the baseline predictor.
#plot the two loss functions. # ggplot(resultsList[3]$train.loss.vec) # ggplot(resultsList[3]$validation.loss.vec) # ggplot(resultsList[4]$train.loss.vec) # ggplot(resultsList[4]$validation.loss.vec)
What are the optimal regularization parameters?
# selected
#print out and/or plot the matrix. # pander::pandoc.table(resultsList[5]$train.loss.mat) # pander::pandoc.table(resultsList[6]$train.loss.mat)
Comment on the difference in test loss between the neural network and the baseline predictor.
#plot the two loss functions. #plot the two loss functions. # ggplot(resultsList[5]$train.loss.vec) # ggplot(resultsList[5]$validation.loss.vec) # ggplot(resultsList[6]$train.loss.vec) # ggplot(resultsList[6]$validation.loss.vec)
What are the optimal regularization parameters?
# selected
#print out and/or plot the matrix. # pander::pandoc.table(resultsList[7]$train.loss.mat) # pander::pandoc.table(resultsList[8]$train.loss.mat)
Comment on the difference in test loss between the neural network and the baseline predictor.
#plot the two loss functions. # ggplot(resultsList[7]$train.loss.vec) # ggplot(resultsList[7]$validation.loss.vec) # ggplot(resultsList[8]$train.loss.vec) # ggplot(resultsList[8]$validation.loss.vec)
What are the optimal regularization parameters?
# selected
#print out and/or plot the matrix. # pander::pandoc.table(resultsList[9]$train.loss.mat) # pander::pandoc.table(resultsList[10]$train.loss.mat)
Comment on the difference in test loss between the neural network and the baseline predictor.
#plot the two loss functions. # ggplot(resultsList[9]$train.loss.vec) # ggplot(resultsList[9]$validation.loss.vec) # ggplot(resultsList[10]$train.loss.vec) # ggplot(resultsList[10]$validation.loss.vec)
What are the optimal regularization parameters?
$L(w, V) = \frac{1}{2} \sum_{i=1}^{n} \left[ w^{T} S(V^{T} x_{i}) - y_{i} \right]^{2}$
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.