title: "Vignette Title"
author: "Vignette Author"
date: "r Sys.Date()
"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Vignette Title}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
```{r setup, include = FALSE}
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
library(ggplot2)  # used for the loss-curve plots below
```
```{r data-setup}
library(codingProject2)
library(ElemStatLearn)

## Load the five benchmark data sets and inspect their structure.
data.name.vec <- c("spam", "SAheart", "zip.train", "prostate", "ozone")
data.list <- list()
for(data.name in data.name.vec){
  data(list = data.name, package = "ElemStatLearn")
  data.list[[data.name]] <- get(data.name)
}
str(data.list)

## Keep only the binary sub-problem (digits 0 and 1) of zip.train.
is.binary <- ElemStatLearn::zip.train[, 1] %in% c(0, 1)

## famhist is a factor; convert it to numeric so that as.matrix()
## yields a numeric feature matrix.
SAheart.features <- ElemStatLearn::SAheart[, -ncol(ElemStatLearn::SAheart)]
SAheart.features$famhist <- as.numeric(SAheart.features$famhist)

## Re-build data.list as (label.vec, feature.mat) pairs.
data.list <- list(
  spam = list(
    label.vec = ifelse(ElemStatLearn::spam$spam == "spam", 1, 0),
    feature.mat = as.matrix(ElemStatLearn::spam[, 1:57])),
  SAheart = list(
    label.vec = ElemStatLearn::SAheart$chd,
    feature.mat = as.matrix(SAheart.features)),
  zip.train = list(
    label.vec = ElemStatLearn::zip.train[is.binary, 1],
    feature.mat = ElemStatLearn::zip.train[is.binary, -1]),
  prostate = list(
    label.vec = ElemStatLearn::prostate$lpsa,
    ## exclude the label (lpsa, column 9) and the train indicator (column 10)
    feature.mat = as.matrix(ElemStatLearn::prostate[, -c(9, 10)])),
  ozone = list(
    label.vec = ElemStatLearn::ozone$ozone,
    ## exclude the label (ozone, the first column) from the features
    feature.mat = as.matrix(ElemStatLearn::ozone[, -1])))
```

For each train/test split, to show that the algorithm is actually learning something non-trivial from the inputs/features, we also compute a baseline predictor that ignores the inputs/features: for regression, the mean of the training labels; for binary classification, the most frequent class/label in the training data.

```{r experiment}
n.folds <- 4
max.iterations <- 30
penalty.vec <- c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5)
resultsList <- list()
pred.list.by.data <- list()
fold.list.by.data <- list()

for(data.name in names(data.list)){
  one.data.set <- data.list[[data.name]]
  set.seed(1)
  X.mat <- one.data.set$feature.mat
  y.vec <- one.data.set$label.vec
  fold.vec <- sample(rep(1:n.folds, l = nrow(X.mat)))
  fold.list.by.data[[data.name]] <- fold.vec
  pred.list <- list()
  for(test.fold in 1:n.folds){
    is.test <- fold.vec == test.fold
    is.train <- !is.test
    X.train <- X.mat[is.train, ]
    y.train <- y.vec[is.train]
    fold.vec.train <- sample(rep(1:n.folds, l = nrow(X.train)))
    if(data.name %in% c("prostate", "ozone")){
      ## Regression baseline: the mean of the training labels.
      baseline <- mean(y.train)
      resultES <- codingProject2::LMSquareLossEarlyStoppingCV(
        X.train, y.train, fold.vec.train, max.iterations)
      resultREG <- codingProject2::LMSquareLossL2CV(
        X.train, y.train, fold.vec.train, penalty.vec)
    }else{
      ## Binary classification baseline: the most frequent class in the
      ## training labels.
      baseline <- as.numeric(names(which.max(table(y.train))))
      resultES <- codingProject2::LMLogisticLossEarlyStoppingCV(
        X.train, y.train, fold.vec.train, max.iterations)
      resultREG <- codingProject2::LMLogisticLossIterations(
        X.train, y.train, fold.vec.train, max.iterations)
    }
    ## Held-out predictions (this assumes each fit returns a list with a
    ## predict() function, as the early stopping fits do).
    pred.list[[test.fold]] <- list(
      earlyStopping = resultES$predict(X.mat[is.test, ]),
      L2regularized = resultREG$predict(X.mat[is.test, ]),
      baseline = rep(baseline, sum(is.test)))
  }
  pred.list.by.data[[data.name]] <- pred.list
  ## Keep the fits from the last test fold for the tables and plots below:
  ## (early stopping fit, second fit) for spam, SAheart, zip.train, then
  ## prostate, ozone -- ten entries in total.
  resultsList[[length(resultsList) + 1]] <- resultES
  resultsList[[length(resultsList) + 1]] <- resultREG
}
```

For each data set, we then want a 4 x 3 matrix of mean test loss values: each of the four rows is a specific test fold, the first column is the early stopping predictor, the second column is the L2 regularized predictor, and the third column is the baseline/un-informed predictor.
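The experiment above stores the held-out predictions in `pred.list.by.data` but does not yet assemble that matrix. Below is a minimal sketch of one way to compute it, assuming square loss for the regression sets and 0-1 loss (predicted probabilities thresholded at 0.5) for the classification sets; `mean.loss.list` and this loop are our additions, not part of codingProject2.

```{r mean-test-loss}
## For each data set, a 4 x 3 matrix of mean test loss values:
## rows = test folds; columns = earlyStopping, L2regularized, baseline.
mean.loss.list <- list()
for(data.name in names(pred.list.by.data)){
  y.vec <- data.list[[data.name]]$label.vec
  fold.vec <- fold.list.by.data[[data.name]]
  is.regression <- data.name %in% c("prostate", "ozone")
  loss.mat <- t(sapply(1:n.folds, function(test.fold){
    y.test <- y.vec[fold.vec == test.fold]
    sapply(pred.list.by.data[[data.name]][[test.fold]], function(pred.vec){
      if(is.regression){
        mean((pred.vec - y.test)^2)                   # square loss
      }else{
        mean(ifelse(pred.vec > 0.5, 1, 0) != y.test)  # 0-1 loss
      }
    })
  }))
  mean.loss.list[[data.name]] <- loss.mat
}
mean.loss.list
```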
```{r}
## Print out the matrix of loss values for the spam models
## (entries 1-2 of resultsList).
pander::pandoc.table(resultsList[[1]]$train.loss.mat)
pander::pandoc.table(resultsList[[2]]$train.loss.mat)
```
Comment on the difference between the learned model and the baseline.
```{r}
## Plot the two loss functions (train and validation) for each model.
plot.loss <- function(loss.vec){
  ggplot(data.frame(iteration = seq_along(loss.vec), loss = loss.vec),
         aes(iteration, loss)) +
    geom_line()
}
plot.loss(resultsList[[1]]$train.loss.vec)
plot.loss(resultsList[[1]]$validation.loss.vec)
plot.loss(resultsList[[2]]$train.loss.vec)
plot.loss(resultsList[[2]]$validation.loss.vec)
```
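It can also help to draw both curves on one panel, so the early stopping point is visible. The `plot.both` helper below is our sketch, not a codingProject2 function, and assumes the same `train.loss.vec`/`validation.loss.vec` fields used above.

```{r}
## Train and validation loss on one panel, with the validation minimum
## (the early stopping point) marked.
plot.both <- function(result){
  loss.df <- rbind(
    data.frame(set = "train",
               iteration = seq_along(result$train.loss.vec),
               loss = result$train.loss.vec),
    data.frame(set = "validation",
               iteration = seq_along(result$validation.loss.vec),
               loss = result$validation.loss.vec))
  val.df <- subset(loss.df, set == "validation")
  ggplot(loss.df, aes(iteration, loss, color = set)) +
    geom_line() +
    geom_point(data = val.df[which.min(val.df$loss), ])
}
plot.both(resultsList[[1]])
```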
What are the optimal regularization parameters?
```{r}
# selected
```
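The `# selected` chunks in this vignette are left as stubs. One way to answer the question is sketched below, under the assumption that each entry of `resultsList` exposes the `validation.loss.vec` plotted above: for the early stopping fits the minimizing index is the chosen iteration count, and for the L2 square loss fits it indexes `penalty.vec`.

```{r selected-parameters}
## For every fitted model, report the index minimizing validation loss.
## Odd entries of resultsList are early stopping fits; even entries are
## the second fit for each data set.
for(model.i in seq_along(resultsList)){
  best.index <- which.min(resultsList[[model.i]]$validation.loss.vec)
  cat(sprintf("model %2d: validation loss minimized at index %d\n",
              model.i, best.index))
}
```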
```{r}
## Print out the matrix of loss values for the SAheart models
## (entries 3-4 of resultsList).
pander::pandoc.table(resultsList[[3]]$train.loss.mat)
pander::pandoc.table(resultsList[[4]]$train.loss.mat)
```
Comment on the difference between the learned model and the baseline.
```{r}
## Plot the two loss functions for the SAheart models.
plot.loss(resultsList[[3]]$train.loss.vec)
plot.loss(resultsList[[3]]$validation.loss.vec)
plot.loss(resultsList[[4]]$train.loss.vec)
plot.loss(resultsList[[4]]$validation.loss.vec)
```
What are the optimal regularization parameters?
```{r}
# selected
```
```{r}
## Print out the matrix of loss values for the zip.train models
## (entries 5-6 of resultsList).
pander::pandoc.table(resultsList[[5]]$train.loss.mat)
pander::pandoc.table(resultsList[[6]]$train.loss.mat)
```
Comment on the difference between the learned model and the baseline.
```{r}
## Plot the two loss functions for the zip.train models.
plot.loss(resultsList[[5]]$train.loss.vec)
plot.loss(resultsList[[5]]$validation.loss.vec)
plot.loss(resultsList[[6]]$train.loss.vec)
plot.loss(resultsList[[6]]$validation.loss.vec)
```
What are the optimal regularization parameters?
```{r}
# selected
```
```{r}
## Print out the matrix of loss values for the prostate models
## (entries 7-8 of resultsList).
pander::pandoc.table(resultsList[[7]]$train.loss.mat)
pander::pandoc.table(resultsList[[8]]$train.loss.mat)
```
Comment on the difference between the learned model and the baseline.
```{r}
## Plot the two loss functions for the prostate models.
plot.loss(resultsList[[7]]$train.loss.vec)
plot.loss(resultsList[[7]]$validation.loss.vec)
plot.loss(resultsList[[8]]$train.loss.vec)
plot.loss(resultsList[[8]]$validation.loss.vec)
```
What are the optimal regularization parameters?
```{r}
# selected
```
```{r}
## Print out the matrix of loss values for the ozone models
## (entries 9-10 of resultsList).
pander::pandoc.table(resultsList[[9]]$train.loss.mat)
pander::pandoc.table(resultsList[[10]]$train.loss.mat)
```
Comment on the difference between the learned model and the baseline.
```{r}
## Plot the two loss functions for the ozone models.
plot.loss(resultsList[[9]]$train.loss.vec)
plot.loss(resultsList[[9]]$validation.loss.vec)
plot.loss(resultsList[[10]]$train.loss.vec)
plot.loss(resultsList[[10]]$validation.loss.vec)
```
What are the optimal regularization parameters?
```{r}
# selected
```