Alternate interface to various machine learning algorithms

Share:

Description

In order to provide a unified (formula-based) interface to various machine learning algorithms, these functions wrap a common user interface around existing code.

Usage

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
mlearning(formula, data, method, model.args, call = match.call(), ...,
    subset, na.action = na.fail)
## S3 method for class 'mlearning'
print(x, ...)
## S3 method for class 'mlearning'
summary(object, ...)
## S3 method for class 'summary.mlearning'
print(x, ...)
## S3 method for class 'mlearning'
plot(x, y, ...)
## S3 method for class 'mlearning'
predict(object, newdata, type = c("class", "membership", "both"),
    method = c("direct", "cv"), na.action = na.exclude, ...)
    
cvpredict(object, ...)
## S3 method for class 'mlearning'
cvpredict(object, type = c("class", "membership", "both"),
    cv.k = 10, cv.strat = TRUE, ...)

mlLda(...)
## Default S3 method:
mlLda(train, response, ...)
## S3 method for class 'formula'
mlLda(formula, data, ..., subset, na.action)
## S3 method for class 'mlLda'
predict(object, newdata, type = c("class", "membership", "both",
    "projection"), prior = object$prior, dimension,
    method = c("plug-in", "predictive", "debiased", "cv"), ...)

mlQda(...)
## Default S3 method:
mlQda(train, response, ...)
## S3 method for class 'formula'
mlQda(formula, data, ..., subset, na.action)
## S3 method for class 'mlQda'
predict(object, newdata, type = c("class", "membership", "both"),
    prior = object$prior, method = c("plug-in", "predictive", "debiased",
    "looCV", "cv"), ...)

mlRforest(...)
## Default S3 method:
mlRforest(train, response, ntree = 500, mtry, replace = TRUE, classwt = NULL, ...)
## S3 method for class 'formula'
mlRforest(formula, data, ntree = 500, mtry, replace = TRUE, classwt = NULL, ...,
    subset, na.action)
## S3 method for class 'mlRforest'
predict(object, newdata, type = c("class", "membership", "both",
    "vote"), method = c("direct", "oob", "cv"), ...)

mlNnet(...)
## Default S3 method:
mlNnet(train, response, size = NULL, rang = NULL, decay = 0, maxit = 1000, ...)
## S3 method for class 'formula'
mlNnet(formula, data, size = NULL, rang = NULL, decay = 0, maxit = 1000, ...,
    subset, na.action)

mlLvq(...)
## Default S3 method:
mlLvq(train, response, k.nn = 5, size, prior, algorithm = "olvq1", ...)
## S3 method for class 'formula'
mlLvq(formula, data, k.nn = 5, size, prior, algorithm = "olvq1", ...,
    subset, na.action)
## S3 method for class 'lvq'
summary(object, ...)
## S3 method for class 'summary.lvq'
print(x, ...)
## S3 method for class 'mlLvq'
predict(object, newdata, type = "class", method = c("direct", "cv"), 
    na.action = na.exclude,...)

mlSvm(...)
## Default S3 method:
mlSvm(train, response, scale = TRUE, type = NULL, kernel = "radial",
    classwt = NULL, ...)
## S3 method for class 'formula'
mlSvm(formula, data, scale = TRUE, type = NULL, kernel = "radial",
    classwt = NULL, ..., subset, na.action)
## S3 method for class 'mlSvm'
predict(object, newdata, type = c("class", "membership", "both"),
    method = c("direct", "cv"), na.action = na.exclude,...)

mlNaiveBayes(...)
## Default S3 method:
mlNaiveBayes(train, response, laplace = 0, ...)
## S3 method for class 'formula'
mlNaiveBayes(formula, data, laplace = 0, ..., subset, na.action)

response(object, ...)
## Default S3 method:
response(object, ...)
train(object, ...)
## Default S3 method:
train(object, ...)

Arguments

formula

a formula with left term being the factor variable to predict (for supervised classification), a vector of numbers (for regression) or nothing (for unsupervised classification) and the right term with the list of independent, predictive variables, separated with a plus sign. If the data frame provided contains only the dependent and independent variables, one can use the class ~ . short version (and that one is strongly encouraged). Variables with minus sign are eliminated and calculations on variables are possible according to usual formula convention (possibly protected by I()).

data

a data.frame to use as a training set.

method

a machine learning method to use. For predict(), it is the prediction method. According to predict.lda in package MASS: determines how the parameter estimation is handled. With "plug-in" (the default) the usual unbiased parameter estimates are used and assumed to be correct. With "debiased" an unbiased estimator of the log posterior probabilities is used, and with "predictive" the parameter estimates are integrated out using a vague prior. With "looCV" the leave-one-out cross-validation fits to the original dataset are computed and returned. If you indicate method = "cv" then cvpredict() is used and you cannot provide newdata in that case.

model.args

arguments for formula modeling with substituted data and subset... Not to be used by the end-user.

call

the function call. Not to be used by the end-user.

...

further arguments passed to the machine learning algorithm or the predict() method. See original algorithm.

subset

index vector with the cases to define the training set in use (this argument must be named, if provided).

na.action

function to specify the action to be taken if NAs are found. The default is na.fail. Another option is na.omit, in which case cases with missing values on any required variable are dropped (this argument must be named, if provided). For predict() methods, the default and most suitable option is na.exclude: rows with NAs in newdata are then excluded from prediction, but reinjected in the final results.

cv.k

k for k-fold cross validation, cf errorest().

cv.strat

is the subsampling stratified or not in cross validation, cf errorest().

x

a mlearning object.

y

another object (depending on the machine learning algorithm, but it is usually not used).

object

one of the mlearning objects.

newdata

a data.frame with same variables as data to use for new predictions.

type

the type of result to get. Usually, "class", which is the default. Depending on the algorithm, other types are also available. membership and both are almost always available too. membership corresponds to posterior probability, raw results, normalized votes, etc., depending on the machine learning algorithm. With both, class and membership are both returned at once in a list. For mlSvm(), it is the type of algorithm to use (see ?svm).

train

a matrix or data frame with predictors.

response

a factor vector (classification), a numeric vector (regression), or NULL (unsupervised classification).

prior

prior probabilities of the classes (the proportions in the training set are used by default). For mlLvq(), probabilities to represent classes in the codebook (default values are the proportions in the training set).

dimension

the dimension of the space to be used for prediction.

ntree

the number of trees to generate (use a value large enough to get at least a few predictions for each input row).

mtry

number of variables randomly sampled as candidates at each split.

replace

sample cases with or without replacement?

classwt

priors of the classes. Need not add up to one.

size

number of units in the hidden layer for mlNnet(). Can be zero if there are skip-layer units. If NULL, a reasonable default is computed. For mlLvq(), the size of the codebook. Defaults to min(round(0.4*ng*(ng-1 + p/2),0), n) where ng is the number of classes.

rang

initial random weights on [-rang, rang]. Value about 0.5 unless the inputs are large, in which case it should be chosen so that rang * max(|x|) is about 1. If NULL, a reasonable default is computed.

decay

parameter for weight decay. Default 0.

maxit

maximum number of iterations. Default 1000.

k.nn

k used for k-NN test of correct classification. Default is 5.

algorithm

an algorithm among 'olvq1' (default, the optimized lvq1), 'lvq1', 'lvq2', or 'lvq3'.

scale

are all the variables scaled? If a vector is provided, it is applied to variables with recycling.

kernel

the kernel used by svm, see ?svm. Can be "radial", "linear", "polynomial" or "sigmoid".

laplace

positive double controlling Laplace smoothing for the naive Bayes classifier. The default (0) disables Laplace smoothing.

Details

TODO: explain here the mechanism used to provide a common interface on top of various existing algorithms, and how one can add new items.

Value

A machine learning object where the predict() method can be applied to classify new items.

For response() and train(), the respective response vector and training matrix (the matrix with all predicting terms).

Author(s)

All these functions are just wrappers around existing R code, written by Philippe Grosjean <Philippe.Grosjean@umons.ac.be> in order to get a similar interface and similar objects. All credit goes to the original authors (see the links below).

See Also

confusion, errorest, lda, qda, randomForest, olvq1, nnet, naiveBayes, svm

Examples

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
## Prepare data: split into training set (2/3) and test set (1/3)
data("iris", package = "datasets")
## Row indices of the training set: 34 + 33 + 33 = 100 rows; the remaining
## rows of iris form the test set
train <- c(1:34, 51:83, 101:133)
irisTrain <- iris[train, ]
irisTest <- iris[-train, ]
## One case with missing data in train set, and another case in test set
## (used to show how missing values are handled by the examples below)
irisTrain[1, 1] <- NA
irisTest[25, 2] <- NA

## Second dataset, from package mlbench: used below for the examples that
## predict Class
data("HouseVotes84", package = "mlbench")

## Third dataset: used below for the regression examples (Ozone ~ .)
data(airquality, package = "datasets")

## Supervised classification using linear discriminant analysis
irLda <- mlLda(Species ~ ., data = irisTrain)
irLda
summary(irLda)
## Color the items according to their class (color indices start at 2)
plot(irLda, col = as.numeric(response(irLda)) + 1)
predict(irLda, newdata = irisTest) # class (default type)
## No newdata is given in the next calls
predict(irLda, type = "membership") # posterior probability
predict(irLda, type = "both") # both class and membership in a list
## Sometimes, other types are allowed, like for lda:
predict(irLda, type = "projection") # Projection on the LD axes
## Add test set items to the previous plot (filled symbols, colored by
## predicted class with the same color coding as above)
points(predict(irLda, newdata = irisTest, type = "projection"),
    col = as.numeric(predict(irLda, newdata = irisTest)) + 1, pch = 19)

## predict() and confusion() should be used on a separate test set
## for unbiased estimation (or using cross-validation, bootstrap, ...)
confusion(irLda) # Wrong, cf. biased estimation (so-called, self-consistency)
## Estimation using a separate test set
confusion(predict(irLda, newdata = irisTest), irisTest$Species)

## Another dataset (binary predictor... not optimal for lda, just for test)
## Cases with missing values are dropped here (na.action = na.omit)
summary(res <- mlLda(Class ~ ., data = HouseVotes84, na.action = na.omit))
confusion(res) # Self-consistency
print(confusion(res), error.col = FALSE) # Without error column

## More complex formulas
summary(mlLda(Species ~ . - Sepal.Width, data = iris)) # Exclude variable
summary(mlLda(Species ~ log(Petal.Length) + log(Petal.Width) +
    I(Petal.Length/Sepal.Length), data = iris)) # With calculations

## Factor levels with missing items are allowed
ir2 <- iris[-(51:100), ] # No Iris versicolor in the training set
summary(res <- mlLda(Species ~ ., data = ir2)) # versicolor is NOT there
## Missing levels are reinjected in class or membership by predict()
predict(res, type = "both")
## ... but, of course, the classifier is wrong for Iris versicolor
confusion(predict(res, newdata = iris), iris$Species)

## Simpler interface, but more memory-effective
## (predictors and response are given directly, without a formula)
summary(mlLda(train = iris[, -5], response = iris$Species))


## Supervised classification using quadratic discriminant analysis
summary(res <- mlQda(Species ~ ., data = irisTrain))
confusion(res) # Self-consistency
## Unbiased estimation of performances, using the separate test set
confusion(predict(res, newdata = irisTest), irisTest$Species) # Performances

## Another dataset (binary predictor... not optimal for qda, just for test)
## Cases with missing values are dropped here (na.action = na.omit)
summary(res <- mlQda(Class ~ ., data = HouseVotes84, na.action = na.omit))
confusion(res) # Self-consistency


## Supervised classification using random forest
## (ntree is not specified here, so its default value of 500 is used)
summary(res <- mlRforest(Species ~ ., data = irisTrain))
plot(res)
## For such a relatively simple case, 50 trees are enough
summary(res <- mlRforest(Species ~ ., data = irisTrain, ntree = 50))
predict(res) # Default type is class
predict(res, type = "membership")
predict(res, type = "both") # Both class and membership in a list
predict(res, type = "vote")
## Out-of-bag prediction
predict(res, method = "oob")
confusion(res) # Self-consistency
confusion(res, method = "oob") # Out-of-bag performances
## Cross-validation prediction is a good choice when there is no test set:
predict(res, method = "cv")  # Idem: cvpredict(res)
confusion(res, method = "cv") # Cross-validation for performances estimation
## Evaluation of performances using a separate test set
confusion(predict(res, newdata = irisTest), irisTest$Species) # Test set perfs

## Regression using random forest (from ?randomForest)
## The seed is fixed to get reproducible results
set.seed(131)
summary(ozone.rf <- mlRforest(Ozone ~ ., data = airquality, mtry = 3,
    importance = TRUE, na.action = na.omit))
## Show "importance" of variables: higher value mean more important:
round(randomForest::importance(ozone.rf), 2)
## Observed versus predicted values; na.omit() is applied to airquality to
## match the cases kept for training (na.action = na.omit above)
plot(na.omit(airquality)$Ozone, predict(ozone.rf))
## Identity line (intercept 0, slope 1) as a reference
abline(a = 0, b = 1)

## Unsupervised classification using random forest (from ?randomForest)
## The seed is fixed to get reproducible results
set.seed(17)
## No left term in the formula, and Species (column 5) is dropped from data
summary(iris.urf <- mlRforest(~ ., iris[, -5]))
randomForest::MDSplot(iris.urf, iris$Species)
## Hierarchical clustering (average linkage) using 1 - proximity as
## dissimilarity, with leaves labelled by the actual species
plot(hclust(as.dist(1 - iris.urf$proximity), method = "average"),
    labels = iris$Species)


## Supervised classification using neural network
## The seed is fixed before each fit to get reproducible results
set.seed(689)
## size and rang are not specified here: defaults are computed
summary(res <- mlNnet(Species ~ ., data = irisTrain))
predict(res) # Default type is class
predict(res, type = "membership")
predict(res, type = "both") # Both class and membership in a list
confusion(res) # Self-consistency
confusion(predict(res, newdata = irisTest), irisTest$Species) # Test set perfs

## Idem, but two classes prediction using factor predictors
set.seed(325)
summary(res <- mlNnet(Class ~ ., data = HouseVotes84, na.action = na.omit))
confusion(res) # Self-consistency

## Regression using neural network
set.seed(34)
## skip and linout are not mlNnet() arguments: they are passed through ...
## to the underlying algorithm
summary(ozone.nnet <- mlNnet(Ozone ~ ., data = airquality, na.action = na.omit,
    skip = TRUE, decay = 1e-3, size = 20, linout = TRUE))
## Observed versus predicted values; na.omit() is applied to airquality to
## match the cases kept for training (na.action = na.omit above)
plot(na.omit(airquality)$Ozone, predict(ozone.nnet))
## Identity line (intercept 0, slope 1) as a reference
abline(a = 0, b = 1)


## Supervised classification using learning vector quantization
## (default arguments: k.nn = 5 and algorithm = "olvq1")
summary(res <- mlLvq(Species ~ ., data = irisTrain))
predict(res) # This object only returns class
confusion(res) # Self-consistency
confusion(predict(res, newdata = irisTest), irisTest$Species) # Test set perfs


## Supervised classification using support vector machine
## (default arguments: scale = TRUE and kernel = "radial")
summary(res <- mlSvm(Species ~ ., data = irisTrain))
predict(res) # Default type is class
predict(res, type = "membership")
predict(res, type = "both") # Both class and membership in a list
confusion(res) # Self-consistency
confusion(predict(res, newdata = irisTest), irisTest$Species) # Test set perfs

## Another dataset
## Cases with missing values are dropped here (na.action = na.omit)
summary(res <- mlSvm(Class ~ ., data = HouseVotes84, na.action = na.omit))
confusion(res) # Self-consistency

## Regression using support vector machine
summary(ozone.svm <- mlSvm(Ozone ~ ., data = airquality, na.action = na.omit))
## Observed versus predicted values; na.omit() is applied to airquality to
## match the cases kept for training (na.action = na.omit above)
plot(na.omit(airquality)$Ozone, predict(ozone.svm))
## Identity line (intercept 0, slope 1) as a reference
abline(a = 0, b = 1)


## Supervised classification using naive Bayes
## (laplace is left at its default of 0, i.e., no Laplace smoothing)
summary(res <- mlNaiveBayes(Species ~ ., data = irisTrain))
predict(res) # Default type is class
predict(res, type = "membership")
predict(res, type = "both") # Both class and membership in a list
confusion(res) # Self-consistency
confusion(predict(res, newdata = irisTest), irisTest$Species) # Test set perfs

## Another dataset
## Cases with missing values are dropped here (na.action = na.omit)
summary(res <- mlNaiveBayes(Class ~ ., data = HouseVotes84, na.action = na.omit))
confusion(res) # Self-consistency