---
title: "Report for Project One: Nearest Neighbors"
author: "Kaitlyn Grubb"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Report for Project One: Nearest Neighbors}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---
```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```
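The experiments below call a function `NNLearnCV`, presumably provided by the project's own package, which selects the number of neighbors by K-fold cross-validation. Since that package is not loaded in this vignette, the following is a minimal sketch of the assumed interface, not the project's actual implementation: it grid-searches k = 1, ..., `max.neighbors`, uses mean zero-one loss for binary labels and mean square loss otherwise, and returns the components used in the report (`train.loss.mat`, `train.loss.vec`, `validation.loss.vec`, `selected.neighbors`, and a `predict` function).

```{r}
## Sketch of the assumed NNLearnCV interface (assumptions noted above).
NNLearnCV <- function(feature.mat, label.vec, max.neighbors = 30,
                      fold.vec = NULL, n.folds = 5){
  feature.mat <- as.matrix(feature.mat)
  if(is.null(fold.vec)){
    fold.vec <- sample(rep(1:n.folds, l = nrow(feature.mat)))
  }
  is.binary <- all(label.vec %in% c(0, 1))
  loss.fun <- if(is.binary){
    function(pred, y)mean((pred > 0.5) != y) # mean zero-one loss.
  }else{
    function(pred, y)mean((pred - y)^2)      # mean square loss.
  }
  ## Mean of the k nearest training labels for each row of X.new and
  ## each k = 1, ..., max.neighbors (one row per row of X.new).
  pred.all.k <- function(X.train, y.train, X.new){
    n.new <- nrow(X.new)
    dist.mat <- as.matrix(dist(rbind(X.new, X.train)))[
      seq_len(n.new), n.new + seq_len(nrow(X.train)), drop = FALSE]
    t(apply(dist.mat, 1, function(d){
      cumsum(y.train[order(d)][1:max.neighbors]) / (1:max.neighbors)
    }))
  }
  train.loss.mat <- validation.loss.mat <- matrix(NA, max.neighbors, n.folds)
  for(validation.fold in 1:n.folds){
    is.train <- fold.vec != validation.fold
    X.train <- feature.mat[is.train, , drop = FALSE]
    y.train <- label.vec[is.train]
    for(set.name in c("train", "validation")){
      is.set <- if(set.name == "train")is.train else !is.train
      pred.mat <- pred.all.k(
        X.train, y.train, feature.mat[is.set, , drop = FALSE])
      loss.vec <- sapply(1:max.neighbors, function(k){
        loss.fun(pred.mat[, k], label.vec[is.set])
      })
      if(set.name == "train"){
        train.loss.mat[, validation.fold] <- loss.vec
      }else{
        validation.loss.mat[, validation.fold] <- loss.vec
      }
    }
  }
  validation.loss.vec <- rowMeans(validation.loss.mat)
  selected.neighbors <- which.min(validation.loss.vec)
  list(
    train.loss.mat = train.loss.mat,
    train.loss.vec = rowMeans(train.loss.mat),
    validation.loss.vec = validation.loss.vec,
    selected.neighbors = selected.neighbors,
    predict = function(X.new){
      pred <- pred.all.k(feature.mat, label.vec,
                         as.matrix(X.new))[, selected.neighbors]
      if(is.binary)as.numeric(pred > 0.5) else pred
    })
}
```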
```{r}
## Load the five ElemStatLearn data sets.
library(ElemStatLearn)
data.name.vec <- c("spam", "SAheart", "zip.train", "prostate", "ozone")
data.list <- list()
for(data.name in data.name.vec){
  data(list = data.name, package = "ElemStatLearn")
  data.list[[data.name]] <- get(data.name)
}
str(data.list)
```

```{r}
## Build one (label.vec, feature.mat) pair per data set. Only the digits
## 0 and 1 of zip.train are kept, so that problem stays binary. The spam
## and zip.train entries follow the original code; the outcome columns
## for SAheart, prostate, and ozone (chd, lpsa, ozone) are assumed from
## the ElemStatLearn documentation.
is.binary <- ElemStatLearn::zip.train[, 1] %in% c(0, 1)
data.list <- list(
  spam=list(
    label.vec=ifelse(ElemStatLearn::spam$spam == "spam", 1, 0),
    feature.mat=as.matrix(ElemStatLearn::spam[, -ncol(ElemStatLearn::spam)])),
  SAheart=list(
    label.vec=ElemStatLearn::SAheart$chd,
    feature.mat=data.matrix(
      ElemStatLearn::SAheart[, names(ElemStatLearn::SAheart) != "chd"])),
  zip.train=list(
    label.vec=ElemStatLearn::zip.train[is.binary, 1],
    feature.mat=ElemStatLearn::zip.train[is.binary, -1]),
  prostate=list(
    label.vec=ElemStatLearn::prostate$lpsa,
    feature.mat=as.matrix(ElemStatLearn::prostate[
      , !names(ElemStatLearn::prostate) %in% c("lpsa", "train")])),
  ozone=list(
    label.vec=ElemStatLearn::ozone$ozone,
    feature.mat=as.matrix(
      ElemStatLearn::ozone[, names(ElemStatLearn::ozone) != "ozone"])))
```

```{r}
max.neighbors <- 30 # largest k on the grid (an assumed default).
n.folds <- 5
resultsList <- list() # list of lists, one result per data set.
for(data.name in names(data.list)){
  one.data.set <- data.list[[data.name]]
  set.seed(1)
  fold.vec <- sample(rep(1:n.folds, l = nrow(one.data.set$feature.mat)))
  ## Cross-validation over the whole k grid; the per-data-set sections
  ## below report these results.
  resultsList[[data.name]] <- NNLearnCV(
    one.data.set$feature.mat, one.data.set$label.vec,
    max.neighbors, fold.vec, n.folds)
  ## Nested resampling: hold out each fold as a test set, re-run the
  ## cross-validated learner on the rest, then predict the held-out rows.
  for(test.fold in 1:n.folds){
    is.train <- fold.vec != test.fold
    fit <- NNLearnCV(
      one.data.set$feature.mat[is.train, ], one.data.set$label.vec[is.train])
    pred.vec <- fit$predict(one.data.set$feature.mat[!is.train, ])
    ## TODO: compute the test error of pred.vec with respect to
    ## one.data.set$label.vec[!is.train].
  }
}
```
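Each data-set section below plots the mean train and validation loss as functions of the number of neighbors. A small helper (a hypothetical convenience, not part of the assignment code) reshapes one result into the long data frame that ggplot2 expects; it assumes `train.loss.vec` and `validation.loss.vec` are indexed by the number of neighbors.

```{r}
library(ggplot2)
plotLossCurves <- function(res){
  ## One row per (k, set) pair, so both curves share one panel.
  loss.df <- data.frame(
    neighbors = rep(seq_along(res$train.loss.vec), 2),
    mean.loss = c(res$train.loss.vec, res$validation.loss.vec),
    set = rep(c("train", "validation"), each = length(res$train.loss.vec)))
  ggplot(loss.df, aes(neighbors, mean.loss, color = set)) +
    geom_line()
}
```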
```{r}
## spam: print the train loss matrix (one column per validation fold).
pander::pandoc.table(resultsList[["spam"]]$train.loss.mat)
```
Comment on the difference between the nearest neighbors predictor and the featureless baseline.
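As a reference point for that comparison, the chunk below computes the loss of a hypothetical featureless baseline; assuming the baseline predicts the most frequent class, its mean zero-one loss is the smaller class proportion.

```{r}
## Baseline for spam (assumed: predict the most frequent class).
y <- data.list[["spam"]]$label.vec
min(mean(y == 0), mean(y == 1))
```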
```{r}
## spam: plot the two mean loss curves as functions of k.
plotLossCurves(resultsList[["spam"]])
```
What is the optimal number of neighbors?
```{r}
## spam: print the number of neighbors selected by cross-validation.
resultsList[["spam"]]$selected.neighbors
```
```{r}
## SAheart: print the train loss matrix (one column per validation fold).
pander::pandoc.table(resultsList[["SAheart"]]$train.loss.mat)
```
Comment on the difference between the nearest neighbors predictor and the featureless baseline.
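As above, the chunk below computes the hypothetical featureless baseline's mean zero-one loss (the smaller class proportion) for reference.

```{r}
## Baseline for SAheart (assumed: predict the most frequent class).
y <- data.list[["SAheart"]]$label.vec
min(mean(y == 0), mean(y == 1))
```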
```{r}
## SAheart: plot the two mean loss curves as functions of k.
plotLossCurves(resultsList[["SAheart"]])
```
What is the optimal number of neighbors?
```{r}
## SAheart: print the number of neighbors selected by cross-validation.
resultsList[["SAheart"]]$selected.neighbors
```
```{r}
## zip.train: print the train loss matrix (one column per validation fold).
pander::pandoc.table(resultsList[["zip.train"]]$train.loss.mat)
```
Comment on the difference between the nearest neighbors predictor and the featureless baseline.
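Again for reference, the chunk below computes the hypothetical featureless baseline's mean zero-one loss on the binary zip.train subset.

```{r}
## Baseline for zip.train (assumed: predict the most frequent class).
y <- data.list[["zip.train"]]$label.vec
min(mean(y == 0), mean(y == 1))
```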
```{r}
## zip.train: plot the two mean loss curves as functions of k.
plotLossCurves(resultsList[["zip.train"]])
```
What is the optimal number of neighbors?
```{r}
## zip.train: print the number of neighbors selected by cross-validation.
resultsList[["zip.train"]]$selected.neighbors
```
```{r}
## prostate: print the train loss matrix (one column per validation fold).
pander::pandoc.table(resultsList[["prostate"]]$train.loss.mat)
```
Comment on the difference between the nearest neighbors predictor and the featureless baseline.
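Since prostate is a regression problem, the reference baseline changes: assuming the featureless baseline predicts the mean label, its mean square loss is the variance of the labels, computed below.

```{r}
## Baseline for prostate (assumed: predict the mean label).
y <- data.list[["prostate"]]$label.vec
mean((y - mean(y))^2)
```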
```{r}
## prostate: plot the two mean loss curves as functions of k.
plotLossCurves(resultsList[["prostate"]])
```
What is the optimal number of neighbors?
```{r}
## prostate: print the number of neighbors selected by cross-validation.
resultsList[["prostate"]]$selected.neighbors
```
```{r}
## ozone: print the train loss matrix (one column per validation fold).
pander::pandoc.table(resultsList[["ozone"]]$train.loss.mat)
```
Comment on the difference between the nearest neighbors predictor and the featureless baseline.
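ozone is also a regression problem, so the same assumed baseline applies: predicting the mean label gives a mean square loss equal to the label variance.

```{r}
## Baseline for ozone (assumed: predict the mean label).
y <- data.list[["ozone"]]$label.vec
mean((y - mean(y))^2)
```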
```{r}
## ozone: plot the two mean loss curves as functions of k.
plotLossCurves(resultsList[["ozone"]])
```
What is the optimal number of neighbors?
```{r}
## ozone: print the number of neighbors selected by cross-validation.
resultsList[["ozone"]]$selected.neighbors
```