# mixture.example: Mixture Example

In ElemStatLearn: Data Sets, Functions and Examples from the Book "The Elements of Statistical Learning: Data Mining, Inference, and Prediction" by Trevor Hastie, Robert Tibshirani and Jerome Friedman.

## Description

This is a simulated mixture example with 200 instances and two classes, 100 observations in each class.
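A minimal sketch (output shown as comments) to load the data and confirm the class split:

```r
library(ElemStatLearn)
data(mixture.example)
table(mixture.example$y)  # labels: 0 == green, 1 == red
#   0   1
# 100 100
```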

## Usage

```r
data(mixture.example)
```

## Format

The format is a list of 8 components:

- `x`: 200 x 2 matrix of training predictors.
- `y`: vector of 200 class labels; 0 == green, 1 == red.
- `xnew`: 6831 x 2 matrix of lattice points in the predictor space.
- `prob`: vector of 6831 probabilities (of class RED) at each lattice point.
- `marginal`: vector of 6831 marginal probabilities at each lattice point.
- `px1`: the 69 lattice coordinates for x.1.
- `px2`: the 99 lattice coordinates for x.2 (69 * 99 = 6831).
- `means`: 20 x 2 matrix of the mixture centers, the first ten for one class, the next ten for the other.
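The lattice components fit together as follows (a minimal sketch; it assumes `xnew` is laid out with `px1` varying fastest, as `expand.grid(px1, px2)` would produce, and verifies that assumption rather than taking it for granted):

```r
data(mixture.example)
grid <- as.matrix(expand.grid(x1 = mixture.example$px1,
                              x2 = mixture.example$px2))
all.equal(unname(grid), unname(mixture.example$xnew))  # TRUE if the layout assumption holds
# prob then reshapes into a length(px1) x length(px2) matrix, as contour() expects:
dim(matrix(mixture.example$prob,
           length(mixture.example$px1), length(mixture.example$px2)))  # 69 99
```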

## Examples

```r
str(mixture.example)
if (interactive()) par(ask = TRUE)
x <- mixture.example$x
g <- mixture.example$y

# Figure 2.1: linear regression of the 0/1 labels on the two predictors
x.mod <- lm(g ~ x)
plot(x, col = ifelse(g == 1, "red", "green"), xlab = "x1", ylab = "x2")
coef(x.mod)
# decision boundary: the line where the fitted value equals 0.5
abline((0.5 - coef(x.mod)[1]) / coef(x.mod)[3],
       -coef(x.mod)[2] / coef(x.mod)[3])
ghat <- ifelse(fitted(x.mod) > 0.5, 1, 0)
length(ghat)
sum(ghat == g)
1 - sum(ghat == g) / length(g)
# [1] 0.27  training misclassification rate

xnew <- mixture.example$xnew
dim(xnew)
colnames(xnew)
library(class)
mod15 <- knn(x, xnew, g, k = 15, prob = TRUE)
summary(mod15)

# Figure 2.2: 15-nearest-neighbour classification
plot(x, col = ifelse(g == 1, "red", "green"), xlab = "x1", ylab = "x2")
str(mod15)
prob <- attr(mod15, "prob")
# prob is the voting fraction for the *winning* class;
# make it the voting fraction for red == 1
prob <- ifelse(mod15 == "1", prob, 1 - prob)
px1 <- mixture.example$px1
px2 <- mixture.example$px2
prob15 <- matrix(prob, length(px1), length(px2))
contour(px1, px2, prob15, levels = 0.5, labels = "",
        xlab = "x1", ylab = "x2", main = "15-nearest neighbour")
# add the training points to the plot:
points(x, col = ifelse(g == 1, "red", "green"))
ghat15 <- ifelse(knn(x, x, k = 15, cl = g) == "1", 1, 0)
sum(ghat15 == g)
# [1] 169
1 - sum(ghat15 == g) / length(g)
# [1] 0.155  misclassification rate for knn(, k = 15)

# Figure 2.3: the same plot for knn with k = 1
mod1 <- knn(x, xnew, k = 1, cl = g, prob = TRUE)
prob <- attr(mod1, "prob")
prob <- ifelse(mod1 == "1", prob, 1 - prob)  # now the voting fraction for "red"
prob1 <- matrix(prob, length(px1), length(px2))
contour(px1, px2, prob1, levels = 0.5, labels = "",
        xlab = "x1", ylab = "x2", main = "1-nearest neighbour")
# add the training points to the plot:
points(x, col = ifelse(g == 1, "red", "green"))

# Reproducing Figure 2.4, page 17 of the book.
# The data do not contain a test sample, so we make one using the
# description of the oracle on page 17 of the book: the centers are in
# the means component of mixture.example, the first ten for green (0),
# the next ten for red (1). For a test sample of size 10000 we simulate
# 5000 observations of each class.
library(MASS)
set.seed(123)
centers <- c(sample(1:10, 5000, replace = TRUE),
             sample(11:20, 5000, replace = TRUE))
means <- mixture.example$means
means <- means[centers, ]
mix.test <- mvrnorm(10000, c(0, 0), 0.2 * diag(2))
mix.test <- mix.test + means
cltest <- c(rep(0, 5000), rep(1, 5000))
ks <- c(1, 3, 5, 7, 9, 11, 15, 17, 23, 25, 35, 45, 55, 83, 101, 151)
# numbers of nearest neighbours to try
nks <- length(ks)
misclass.train <- numeric(length = nks)
misclass.test <- numeric(length = nks)
names(misclass.train) <- names(misclass.test) <- ks
for (i in seq_along(ks)) {
  mod.train <- knn(x, x, k = ks[i], cl = g)
  mod.test <- knn(x, mix.test, k = ks[i], cl = g)
  misclass.train[i] <- 1 - sum(mod.train == factor(g)) / 200
  misclass.test[i] <- 1 - sum(mod.test == factor(cltest)) / 10000
}
print(cbind(misclass.train, misclass.test))

# Using package mclust02.
# Note that this package is no longer on CRAN and must be
# searched for in the archives.
## Not run: 
if (require(mclust02)) {
  x <- mixture.example$x
  g <- mixture.example$y
  xnew <- mixture.example$xnew
  px1 <- mixture.example$px1
  px2 <- mixture.example$px2
  mix.mclust <- mclustDA(x, g, xnew, G = 1:6, verbose = TRUE)
  mix.mclust
} # end require(mclust02)
## End(Not run)

# Figure 2.4: train and test misclassification against k
plot(misclass.train, xlab = "Number of NN", ylab = "Misclassification error",
     type = "n", xaxt = "n",
     ylim = range(misclass.train, misclass.test))  # keep both curves in view
axis(1, 1:length(ks), as.character(ks))
lines(misclass.test, type = "b", col = "blue", pch = 20)
lines(misclass.train, type = "b", col = "red", pch = 20)
legend("bottomright", lty = 1, col = c("red", "blue"),
       legend = c("train", "test"))

# Figure 2.5: Bayes decision boundary
prob <- mixture.example$prob
prob.bayes <- matrix(prob, length(px1), length(px2))
contour(px1, px2, prob.bayes, levels = 0.5, labels = "",
        xlab = "x1", ylab = "x2", main = "Bayes decision boundary")
points(x, col = ifelse(g == 1, "red", "green"))
```
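Because the test sample above is drawn from a fully known oracle (each class an equal mixture of ten Gaussians with covariance 0.2 * I), the Bayes rule can also be evaluated on it directly. The sketch below is not part of the original examples; `bayes.classify` is a hypothetical helper, and the code reuses `mix.test` and `cltest` from the examples above.

```r
# Hedged sketch: estimate the Bayes error on the simulated test sample.
# Assumes the generative model used above: class 0 from the first ten
# centers, class 1 from the last ten, all components N(mu, 0.2 * I).
bayes.classify <- function(pts, means, sigma2 = 0.2) {
  # unnormalized Gaussian density; the shared constants cancel in the comparison
  dens <- function(pt, mu) exp(-sum((pt - mu)^2) / (2 * sigma2))
  apply(pts, 1, function(pt) {
    d <- apply(means, 1, dens, pt = pt)  # density contribution of each center
    ifelse(sum(d[11:20]) > sum(d[1:10]), 1, 0)
  })
}
bayes.test <- bayes.classify(mix.test, mixture.example$means)
1 - sum(bayes.test == cltest) / length(cltest)  # should be close to the Bayes rate (about 0.21)
```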
