<<<<<<< HEAD

---
title: "Vignette Title"
author: "Vignette Author"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Vignette Title}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---


# Global knitr chunk options: collapse source and output into one block and
# prefix printed output lines with "#>".
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
# Attach the package whose functions this vignette demonstrates.
# usethis::use_package() is a development-time helper that edits the
# DESCRIPTION file of the active project, so it must not run on every
# vignette build; declare dependencies in DESCRIPTION once instead.
library(codingProject2)
# library(modeest)

# Attach ElemStatLearn, load every data set of interest with a single data()
# call, collect the loaded objects into a list named after the data sets, and
# print the structure of that list.
library(ElemStatLearn)
data.name.vec <- c("spam", "SAheart", "zip.train", "prostate", "ozone")
data(list = data.name.vec, package = "ElemStatLearn")
# inherits = TRUE mirrors the lookup rules of get(): search the enclosing
# environments as well as the current one.
data.list <- mget(data.name.vec, inherits = TRUE)
str(data.list)
# Build the list of learning problems. Each entry holds
#   label.vec   - the output to predict (0/1 for classification, numeric for
#                 regression), one element per observation;
#   feature.mat - the numeric input matrix, one row per observation.

# zip.train: keep only the digits 0 and 1 to obtain a binary problem
# (column 1 is the digit label, the remaining columns are pixel features).
is.binary <- ElemStatLearn::zip.train[, 1] %in% c(0, 1)

# SAheart: the label is the 0/1 column `chd`. `famhist` is a factor
# (Absent/Present); it is recoded as a 0/1 indicator so that as.matrix()
# yields a numeric matrix instead of a character matrix.
SAheart.features <-
  ElemStatLearn::SAheart[, names(ElemStatLearn::SAheart) != "chd"]
SAheart.features$famhist <- ifelse(SAheart.features$famhist == "Present", 1, 0)

# ozone: the label is the column `ozone`; it is removed from the features
# (dropping the last column instead would keep the label as an input and
# discard `wind`).
ozone.features <-
  ElemStatLearn::ozone[, names(ElemStatLearn::ozone) != "ozone"]

data.list <- list(
  spam = list(
    label.vec = ifelse(ElemStatLearn::spam$spam == "spam", 1, 0),
    feature.mat = as.matrix(ElemStatLearn::spam[, 1:57])),
  SAheart = list(
    label.vec = ElemStatLearn::SAheart$chd,
    feature.mat = as.matrix(SAheart.features)),
  zip.train = list(
    label.vec = ElemStatLearn::zip.train[is.binary, 1],
    feature.mat = ElemStatLearn::zip.train[is.binary, -1]),
  prostate = list(
    label.vec = ElemStatLearn::prostate$lpsa,
    # Columns 9 and 10 are the label (lpsa) and the train/test indicator.
    feature.mat = as.matrix(ElemStatLearn::prostate[, -c(9, 10)])),
  ozone = list(
    label.vec = ElemStatLearn::ozone$ozone,
    feature.mat = as.matrix(ozone.features))
)

# Cross-validation experiment: every data set is split into n.folds train/test
# folds and, on each split, a learned predictor is compared with a baseline
# that ignores the features.
n.folds <- 4
resultsList <- list()
baseline <- list()
# Filled below with one matrix of mean test loss per regression data set
# (rows = test folds, columns = predictors).
mean.loss.list <- list()

for (data.name in names(data.list)) {
  one.data.set <- data.list[[data.name]]
  # Re-seed for every data set so that the fold assignment is reproducible.
  set.seed(1)

  # Inputs shared by all learners.
  X.mat <- one.data.set$feature.mat
  y.vec <- one.data.set$label.vec
  fold.vec <- sample(rep(seq_len(n.folds), length.out = nrow(X.mat)))
  max.iterations <- 30

  # Only needed by the L2-regularized learner, which is currently disabled.
  penalty.vec <- c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5)

  result.mat.list <- list()
  # One slot per test fold; each slot receives a named list of predictions.
  pred.prob.list <- vector("list", n.folds)
  is.regression <- data.name %in% c("prostate", "ozone")

  # Mean test loss: one row per test fold, one column per predictor.
  # An L2regularized column belongs here once that learner is enabled.
  test.loss.mat <- matrix(
    NA_real_, nrow = n.folds, ncol = 2,
    dimnames = list(
      test.fold = seq_len(n.folds),
      predictor = c("earlyStopping", "baseline")))

  for (test.fold in seq_len(n.folds)) {
    is.test <- fold.vec == test.fold
    is.train <- !is.test
    # drop = FALSE keeps matrices even when a fold holds a single row.
    X.train <- X.mat[is.train, , drop = FALSE]
    y.train <- y.vec[is.train]
    X.test <- X.mat[is.test, , drop = FALSE]
    y.test <- y.vec[is.test]

    # Inner fold assignment used by the learner for model selection.
    fold.vec.train <- sample(rep(seq_len(n.folds), length.out = nrow(X.train)))

    print(test.fold)

    if (is.regression) {
      # Baseline that ignores the features: the mean of the training labels.
      baseline <- mean(y.train)

      resultES <- codingProject2::LMSquareLossEarlyStoppingCV(
        X.train,
        y.train,
        fold.vec.train,
        max.iterations)
      # resultREG <- codingProject2::LMSquareLossL2CV(
      #   X.train,
      #   y.train,
      #   fold.vec.train,
      #   penalty.vec)

      str(resultES)
      # str(resultREG)

      # `[[<-` stores the whole named list in the slot of this fold; `[<-`
      # would keep only its first element and warn about the length mismatch.
      # NOTE(review): assumes resultES$predict is a function that maps a
      # feature matrix to one prediction per row -- confirm in codingProject2.
      pred.prob.list[[test.fold]] <- list(
        earlyStopping = resultES$predict(X.test),
        # L2regularized = resultREG$predict(X.test),
        baseline = rep(baseline, sum(is.test)))

      print(pred.prob.list[[test.fold]])

      # Mean squared error of every predictor on the held-out fold.
      fold.loss.vec <- vapply(
        pred.prob.list[[test.fold]],
        function(pred.vec) mean((as.numeric(pred.vec) - y.test)^2),
        numeric(1))
      test.loss.mat[test.fold, names(fold.loss.vec)] <- fold.loss.vec
    }
  }

  if (is.regression) {
    mean.loss.list[[data.name]] <- test.loss.mat
  }

  # TODO: the binary classification data sets (spam, SAheart, zip.train) are
  # not evaluated yet. Plan kept from the original draft:
  #   - baseline: the most frequent class/label/output in the training data
  #     (e.g. modeest::mfv() applied to the training labels);
  #   - learners: LMLogisticLossEarlyStoppingCV() and
  #     LMLogisticLossIterations(), called with the training features, the
  #     training labels, the inner fold vector and max.iterations;
  #   - loss: mean misclassification rate of ifelse(pred > 0.5, 1, 0) against
  #     the test labels.
  #
  # Target layout per data set: a 4 x 3 matrix of mean test loss values, where
  #   each of the four rows is for a specific test set,
  #   the first column is for the early stopping predictor,
  #   the second column is for the L2 regularized predictor,
  #   the third column is for the baseline/un-informed predictor.
  #
  # Intended order of resultsList: LMLogisticLossEarlyStoppingCV and
  # LMLogisticLossIterations for spam, SAheart and zip.train, then
  # LMSquareLossEarlyStoppingCV and LMSquareLossIterations for prostate and
  # ozone.
}

Data set 1: spam

Matrix of loss values

# Print the training-loss matrices of results 1 and 2 (spam).
# `[[` extracts the stored result itself; `resultsList[i]` is a one-element
# list, so `resultsList[i]$train.loss.mat` is always NULL. Results that have
# not been stored are skipped.
for (result.index in c(1, 2)) {
  if (result.index <= length(resultsList)) {
    pander::pandoc.table(resultsList[[result.index]]$train.loss.mat)
  }
}

comment on difference between NN and baseline.

Train/validation loss plot

# Plot the train and validation loss curves of results 1 and 2 (spam), one
# figure per result. ggplot() needs a data frame, so the two loss vectors are
# stacked into long format first; the functions are namespace-qualified
# because ggplot2 is not attached in this file.
# NOTE(review): assumes train.loss.vec and validation.loss.vec are numeric
# vectors indexed by iteration -- confirm against the fitting functions.
for (result.index in c(1, 2)) {
  if (result.index <= length(resultsList)) {
    one.result <- resultsList[[result.index]]
    train.loss <- as.numeric(one.result$train.loss.vec)
    validation.loss <- as.numeric(one.result$validation.loss.vec)
    loss.df <- data.frame(
      set = rep(
        c("train", "validation"),
        c(length(train.loss), length(validation.loss))),
      iteration = c(seq_along(train.loss), seq_along(validation.loss)),
      loss = c(train.loss, validation.loss))
    # Explicit print(): ggplot objects are not auto-printed inside a loop.
    print(
      ggplot2::ggplot(
        loss.df,
        ggplot2::aes(x = iteration, y = loss, color = set)) +
        ggplot2::geom_line())
  }
}

What are the optimal regularization parameters?

# selected 

Data set 2: SAheart

Matrix of loss values

# Print the training-loss matrices of results 3 and 4 (SAheart).
# `[[` extracts the stored result itself; `resultsList[i]` is a one-element
# list, so `resultsList[i]$train.loss.mat` is always NULL. Results that have
# not been stored are skipped.
for (result.index in c(3, 4)) {
  if (result.index <= length(resultsList)) {
    pander::pandoc.table(resultsList[[result.index]]$train.loss.mat)
  }
}

comment on difference between NN and baseline.

Train/validation loss plot

# Plot the train and validation loss curves of results 3 and 4 (SAheart), one
# figure per result. ggplot() needs a data frame, so the two loss vectors are
# stacked into long format first; the functions are namespace-qualified
# because ggplot2 is not attached in this file.
# NOTE(review): assumes train.loss.vec and validation.loss.vec are numeric
# vectors indexed by iteration -- confirm against the fitting functions.
for (result.index in c(3, 4)) {
  if (result.index <= length(resultsList)) {
    one.result <- resultsList[[result.index]]
    train.loss <- as.numeric(one.result$train.loss.vec)
    validation.loss <- as.numeric(one.result$validation.loss.vec)
    loss.df <- data.frame(
      set = rep(
        c("train", "validation"),
        c(length(train.loss), length(validation.loss))),
      iteration = c(seq_along(train.loss), seq_along(validation.loss)),
      loss = c(train.loss, validation.loss))
    # Explicit print(): ggplot objects are not auto-printed inside a loop.
    print(
      ggplot2::ggplot(
        loss.df,
        ggplot2::aes(x = iteration, y = loss, color = set)) +
        ggplot2::geom_line())
  }
}

What are the optimal regularization parameters?

# selected

Data set 3: zip.train

Matrix of loss values

# Print the training-loss matrices of results 5 and 6 (zip.train).
# `[[` extracts the stored result itself; `resultsList[i]` is a one-element
# list, so `resultsList[i]$train.loss.mat` is always NULL. Results that have
# not been stored are skipped.
for (result.index in c(5, 6)) {
  if (result.index <= length(resultsList)) {
    pander::pandoc.table(resultsList[[result.index]]$train.loss.mat)
  }
}

comment on difference between NN and baseline.

Train/validation loss plot

# Plot the train and validation loss curves of results 5 and 6 (zip.train),
# one figure per result. ggplot() needs a data frame, so the two loss vectors
# are stacked into long format first; the functions are namespace-qualified
# because ggplot2 is not attached in this file.
# NOTE(review): assumes train.loss.vec and validation.loss.vec are numeric
# vectors indexed by iteration -- confirm against the fitting functions.
for (result.index in c(5, 6)) {
  if (result.index <= length(resultsList)) {
    one.result <- resultsList[[result.index]]
    train.loss <- as.numeric(one.result$train.loss.vec)
    validation.loss <- as.numeric(one.result$validation.loss.vec)
    loss.df <- data.frame(
      set = rep(
        c("train", "validation"),
        c(length(train.loss), length(validation.loss))),
      iteration = c(seq_along(train.loss), seq_along(validation.loss)),
      loss = c(train.loss, validation.loss))
    # Explicit print(): ggplot objects are not auto-printed inside a loop.
    print(
      ggplot2::ggplot(
        loss.df,
        ggplot2::aes(x = iteration, y = loss, color = set)) +
        ggplot2::geom_line())
  }
}

What are the optimal regularization parameters?

# selected

Data set 4: prostate

Matrix of loss values

# Print the training-loss matrices of results 7 and 8 (prostate).
# `[[` extracts the stored result itself; `resultsList[i]` is a one-element
# list, so `resultsList[i]$train.loss.mat` is always NULL. Results that have
# not been stored are skipped.
for (result.index in c(7, 8)) {
  if (result.index <= length(resultsList)) {
    pander::pandoc.table(resultsList[[result.index]]$train.loss.mat)
  }
}

comment on difference between NN and baseline.

Train/validation loss plot

# Plot the train and validation loss curves of results 7 and 8 (prostate), one
# figure per result. ggplot() needs a data frame, so the two loss vectors are
# stacked into long format first; the functions are namespace-qualified
# because ggplot2 is not attached in this file.
# NOTE(review): assumes train.loss.vec and validation.loss.vec are numeric
# vectors indexed by iteration -- confirm against the fitting functions.
for (result.index in c(7, 8)) {
  if (result.index <= length(resultsList)) {
    one.result <- resultsList[[result.index]]
    train.loss <- as.numeric(one.result$train.loss.vec)
    validation.loss <- as.numeric(one.result$validation.loss.vec)
    loss.df <- data.frame(
      set = rep(
        c("train", "validation"),
        c(length(train.loss), length(validation.loss))),
      iteration = c(seq_along(train.loss), seq_along(validation.loss)),
      loss = c(train.loss, validation.loss))
    # Explicit print(): ggplot objects are not auto-printed inside a loop.
    print(
      ggplot2::ggplot(
        loss.df,
        ggplot2::aes(x = iteration, y = loss, color = set)) +
        ggplot2::geom_line())
  }
}

What are the optimal regularization parameters?

# selected

Data set 5: ozone

Matrix of loss values

# Print the training-loss matrices of results 9 and 10 (ozone).
# `[[` extracts the stored result itself; `resultsList[i]` is a one-element
# list, so `resultsList[i]$train.loss.mat` is always NULL. Results that have
# not been stored are skipped.
for (result.index in c(9, 10)) {
  if (result.index <= length(resultsList)) {
    pander::pandoc.table(resultsList[[result.index]]$train.loss.mat)
  }
}

comment on difference between NN and baseline.

Train/validation loss plot

# Plot the train and validation loss curves of results 9 and 10 (ozone), one
# figure per result. ggplot() needs a data frame, so the two loss vectors are
# stacked into long format first; the functions are namespace-qualified
# because ggplot2 is not attached in this file.
# NOTE(review): assumes train.loss.vec and validation.loss.vec are numeric
# vectors indexed by iteration -- confirm against the fitting functions.
for (result.index in c(9, 10)) {
  if (result.index <= length(resultsList)) {
    one.result <- resultsList[[result.index]]
    train.loss <- as.numeric(one.result$train.loss.vec)
    validation.loss <- as.numeric(one.result$validation.loss.vec)
    loss.df <- data.frame(
      set = rep(
        c("train", "validation"),
        c(length(train.loss), length(validation.loss))),
      iteration = c(seq_along(train.loss), seq_along(validation.loss)),
      loss = c(train.loss, validation.loss))
    # Explicit print(): ggplot objects are not auto-printed inside a loop.
    print(
      ggplot2::ggplot(
        loss.df,
        ggplot2::aes(x = iteration, y = loss, color = set)) +
        ggplot2::geom_line())
  }
}

What are the optimal regularization parameters?

# selected


mertayD/codingProject2 documentation built on May 14, 2019, 11:06 p.m.