qe-Series Predictive Functions | R Documentation |
Quick access to machine learning methods, with a very simple interface. "Works right out of the box!": Just one call needed to fit, no preliminary setup of model etc. The simplicity also makes the series useful for teaching.
qeLogit(data,yName,holdout=floor(min(1000,0.1*nrow(data))),yesYVal=NULL)
qeLin(data,yName,noBeta0=FALSE,holdout=floor(min(1000,0.1*nrow(data))))
qeKNN(data,yName,k=25,scaleX=TRUE,smoothingFtn=mean,yesYVal=NULL,
expandVars=NULL,expandVals =NULL,holdout=floor(min(1000,0.1*nrow(data))))
qeRF(data,yName,nTree=500,minNodeSize=10,mtry=floor(sqrt(ncol(data)))+1,
holdout=floor(min(1000,0.1*nrow(data))))
qeRFranger(data,yName,nTree=500,minNodeSize=10,
mtry=floor(sqrt(ncol(data)))+1,deweightPars=NULL,
holdout=floor(min(1000,0.1*nrow(data))),yesYVal="")
qeRFgrf(data,yName,nTree=2000,minNodeSize=5,mtry=floor(sqrt(ncol(data)))+1,
ll=FALSE,lambda=0.1,splitCutoff=sqrt(nrow(data)),quantls=NULL,
holdout=floor(min(1000,0.1*nrow(data))))
qeSVM(data,yName,gamma=1.0,cost=1.0,kernel='radial',degree=2,
allDefaults=FALSE,holdout=floor(min(1000,0.1*nrow(data))))
qeGBoost(data,yName,nTree=100,minNodeSize=10,learnRate=0.1,
holdout=floor(min(1000,0.1*nrow(data))))
qeAdaBoost(data, yName, treeDepth = 3, nRounds = 100, rpartControl = NULL,
holdout = floor(min(1000, 0.1 * nrow(data))))
qeLightGBoost(data,yName,nTree=100,minNodeSize=10,learnRate=0.1,
holdout=floor(min(1000,0.1*nrow(data))))
qeNeural(data,yName,hidden=c(100,100),nEpoch=30,
acts=rep("relu",length(hidden)),learnRate=0.001,
conv=NULL,xShape=NULL,
holdout=floor(min(1000,0.1*nrow(data))))
qeLASSO(data,yName,alpha=1,holdout=floor(min(1000,0.1*nrow(data))))
qePolyLin(data,yName,deg=2,maxInteractDeg = deg,
holdout=floor(min(1000,0.1*nrow(data))))
qePolyLog(data,yName,deg=2,maxInteractDeg = deg,
holdout=floor(min(1000,0.1*nrow(data))))
qePCA(data,yName,qeName,opts=NULL,pcaProp,
holdout=floor(min(1000,0.1*nrow(data))))
qeUMAP(data,yName,qeName,opts=NULL,
holdout=floor(min(1000,0.1*nrow(data))),scaleX=FALSE,
nComps=NULL,nNeighbors=NULL)
qeDT(data,yName,alpha=0.05,minsplit=20,minbucket=7,maxdepth=0,mtry=0,
holdout=floor(min(1000,0.1*nrow(data))))
qeFOCI(data,yName,numCores=1,parPlat="none",
yesYLevel=NULL)
qeFOCIrand(data,yName,xSetSize,nXSets)
qeFOCImult(data,yName,numCores=1,
parPlat="none",coalesce='union')
qeLinKNN(data,yName,k=25,scaleX=TRUE,smoothingFtn=mean,
expandVars=NULL,expandVals=NULL,
holdout=floor(min(1000,0.1*nrow(data))))
qePolyLASSO(data,yName,deg=2,maxInteractDeg=deg,alpha=0,
holdout=floor(min(1000,0.1*nrow(data))))
qeROC(dataIn,qeOut,yLevelName)
qeXGBoost(data,yName,nRounds=250,
params=list(eta=0.3,max_depth=6,alpha=0),
holdout=floor(min(1000,0.1*nrow(data))))
qeDeepnet(data,yName,hidden=c(10),activationfun="sigm",
learningrate=0.8,momentum=0.5,learningrate_scale=1,
numepochs=3,batchsize=100,hidden_dropout=0,yesYVal=NULL,
holdout=floor(min(1000,0.1*nrow(data))))
qeRpart(data,yName,minBucket=10,holdout=floor(min(1000,
0.1*nrow(data))))
qeParallel(data,yName,qeFtnName,dataName,opts=NULL,cls=1,
libs=NULL,holdout=NULL)
checkPkgLoaded(pkgName,whereObtain='CRAN')
## S3 method for class 'qeParallel'
predict(object,newx,...)
## S3 method for class 'qeLogit'
predict(object,newx,...)
## S3 method for class 'qeLin'
predict(object,newx,useTrainRow1=TRUE,...)
## S3 method for class 'qeKNN'
predict(object,newx,newxK=1,...)
## S3 method for class 'qeRF'
predict(object,newx,...)
## S3 method for class 'qeRFranger'
predict(object,newx,...)
## S3 method for class 'qeRFgrf'
predict(object,newx,...)
## S3 method for class 'qeSVM'
predict(object,newx,...)
## S3 method for class 'qeGBoost'
predict(object,newx,newNTree=NULL,...)
## S3 method for class 'qeLightGBoost'
predict(object,newx,...)
## S3 method for class 'qeNeural'
predict(object,newx,k=NULL,...)
## S3 method for class 'qeLASSO'
predict(object,newx,...)
## S3 method for class 'qePoly'
predict(object,newx)
## S3 method for class 'qePCA'
predict(object,newx,...)
## S3 method for class 'qeUMAP'
predict(object,newx,...)
## S3 method for class 'qeDeepnet'
predict(object,newx,...)
## S3 method for class 'qeRpart'
predict(object,newx,...)
## S3 method for class 'qeLASSO'
plot(x,...)
## S3 method for class 'qeRF'
plot(x,...)
## S3 method for class 'qeRpart'
plot(x,boxPalette=c("red","yellow","green","blue"),...)
... |
Further arguments. |
cls |
Cluster in the sense of parallel package. If not of
class |
libs |
Character vector listing libraries needed to be loaded for
|
Drop out fraction for hidden layer. | |
batchsize |
Batch size. |
numepochs |
Number of iterations to conduct. |
learningrate |
Learning rate. |
momentum |
Momentum. |
learningrate_scale |
Learning rate will be multiplied by this at each iteration, allowing for decay. |
activationfun |
Can be 'sigm', 'tanh' or 'linear'. |
newNTree |
Number of trees to use in prediction. |
newxK |
If predicting new cases, number of nearest neighbors to
smooth in the object returned by |
useTrainRow1 |
If TRUE, take names in |
newx |
New data to be predicted. |
object |
An object returned by a qe-series function. |
minsplit |
Minimum number of data points in a node. |
minbucket |
Minimum number of data points in a terminal node. |
minBucket |
Minimum number of data points in a terminal node. |
maxdepth |
Maximum number of levels in a tree. |
qeName |
Name of qe-series predictive function. |
qeFtnName |
Name of qe-series predictive function. |
conv |
R list specifying the convolutional layers, if any. |
deweightPars |
Values for de-emphasizing variables in a tree node split, e.g. 'list(age=0.2,gender=0.5)'. |
allDefaults |
Use all default values of the wrapped function. |
expandVars |
Columns to be emphasized. |
expandVals |
Emphasis values; a value less than 1 means de-emphasis. |
mtry |
Number of variables randomly tried at each split. |
yesYVal |
Y value to be considered "yes," to be coded 1 rather than 0. |
yesYLevel |
Y value to be considered "yes," to be coded 1 rather than 0. |
noBeta0 |
No intercept term. |
pcaProp |
Desired proportion of overall variance for the PCs. |
data |
Dataframe, training set. Classification case is signaled via labels column being an R factor. |
dataIn |
See |
qeOut |
Output from a qe-series function. |
yName |
Name of the class labels column. |
holdout |
If not NULL, form a holdout set of the specified size. After fitting to the remaining data, evaluate accuracy on the test set. |
k |
Number of nearest neighbors. In functions other than
|
smoothingFtn |
As in |
scaleX |
Scale the features. |
nTree |
Number of trees. |
minNodeSize |
Minimum number of data points in a tree node. |
learnRate |
Learning rate. |
Vector of units per hidden layer. Fractional values
indicate dropout proportions. Can be specified as a string, e.g.
indicated dropout proportions. Can be specified as a string, e.g.
'100,50', for use with | |
nEpoch |
Number of iterations in neural net. |
acts |
Vector of names of the activation functions, one per hidden layer. Choices include 'relu', 'sigmoid', 'tanh', 'softmax', 'elu', 'selu'. |
alpha |
In the case of |
gamma |
Scale parameter in |
cost |
Cost parameter in |
kernel |
In the case of |
degree |
Degree of SVM polynomial kernel, if any. |
opts |
R list of optional arguments for none, some or all of the
functions in |
nComps |
Number of UMAP components to extract. |
nNeighbors |
Number of nearest neighbors to use in UMAP. |
ll |
If TRUE, use local linear forest. |
lambda |
Ridge lambda for local linear forest. |
splitCutoff |
For leaves smaller than this value, do not fit linear model. Just use the linear model fit to the entire dataset. |
xShape |
Input X data shape, e.g. c(28,28) for 28x28 grayscale
images. Must be non-NULL if |
treeDepth |
Number of levels in each tree. |
nRounds |
Number of boosting rounds. |
rpartControl |
An R list specifying properties of fitted trees. |
numCores |
Number of cores to use in parallel computation. |
parPlat |
Parallel platform. Valid values are
'none', 'cluster' (output of |
xSetSize |
Size of subsets of the predictor variables. |
nXSets |
Number of subsets of the predictor variables. |
coalesce |
Method for combining variable sets. |
deg |
Degree of a polynomial. |
maxInteractDeg |
Maximum degree of interaction terms in a polynomial. |
yLevelName |
Name of the class to be considered a positive response in a classification problem. |
params |
Tuning parameters for |
boxPalette |
Color palette. |
pkgName |
Name of wrapped package. |
whereObtain |
Location. |
x |
A qe-series function return object. |
As noted, these functions are intended for quick, first-level analysis of regression/machine learning problems. Emphasis here is on convenience and simplicity.
The idea is that, given a new dataset, the analyst can quickly and easily try fitting a number of models in succession, say first k-NN, then random forests:
# built-in data on major league baseball players
> data(mlb)
> mlb <- mlb[,3:6]  # position, height, weight, age
# fit models
> knnout <- qeKNN(mlb,'Weight',k=25)
> rfout <- qeRF(mlb,'Weight')
# mean abs. pred. error on holdout set, in pounds
> knnout$testAcc
[1] 11.75644
> rfout$testAcc
[1] 12.6787
# predict a new case
> newx <- data.frame(Position='Catcher',Height=73.5,Age=26)
> predict(knnout,newx)
       [,1]
[1,] 204.04
> predict(rfout,newx)
      11
199.1714
# many of the functions include algorithm-specific output
> lassout <- qeLASSO(mlb,'Weight')
holdout set has 101 rows
> lassout$testAcc
[1] 14.27337
> lassout$coefs  # sparse result?
10 x 1 sparse Matrix of class "dgCMatrix"
                                    s1
(Intercept)               -109.2909416
Position.Catcher             0.4408752
Position.First_Baseman       4.8308437
Position.Outfielder          .
Position.Relief_Pitcher      .
Position.Second_Baseman     -0.7846501
Position.Shortstop          -4.2291338
Position.Starting_Pitcher    .
Height                       4.0039114
Age                          0.5352793
The holdout
argument triggers formation of a holdout set
and the corresponding cross-validation evaluation of predictive power.
Note that if a holdout is formed, the return value will consist of the
fit on the training set, not on the full original dataset.
The qe*
functions do model fit. Each of them has a
predict
method, and some also have a plot
method.
Arguments for qe*
are at least:
data
yName
holdout
Typically there are also algorithm-specific hyperparameter arguments.
Arguments for predict
are at least:
object
, the return value from qe*
newx
, a data frame of points to be predicted
For both the fitting function and the prediction function, there may be additional algorithm-specific parameters; default values are provided.
Some notes on specific functions:
The function qeLin
handles not only the usual OLS models
but also classification problems as multivariate-outcome linear
models. If one's goal is prediction, it can be much faster than
qeLogit
, often with comparable accuracy.
Regularization in linear/generalized linear models is
implemented in qeLASSO
and other functions with names
containing 'LASSO', as well as qeNCVregCV
. The latter,
which provides MCP and other regularization methods, wraps the package
of the same name.
Several functions fit polynomial models. The qePolyLin
function does polynomial regression of the indicated degree. In the
above example degree 3 means all terms through degree 3, e.g.
Height * Age^2
. Dummy variables are handled properly, e.g.
no powers of a dummy are generated. The logistic polynomial
regression version is qePolyLog
, and there is a LASSO version,
qePolyLASSO
.
Several random forests implementations are offered:
qeRF
wraps randomForest
in the package of the same name;
qeRFranger
wraps ranger
in the package of the same name;
qeRFgrf
wraps regression_forest
and
ll_regression_forest
in grf (the latter does local
linear smoothing). There is also qeDT
, using
the party package.
Several implementations of gradient boosting are offered,
including qeGBoost
using the gbm package,
qeLightGBoost
using lightgbm, and qeXGBoost
wrapping xgboost.
Several functions involve dimension reduction/feature
selection. Pre-mapping to lower-dimensional manifolds can be done via
qePCA
and qeUMAP
. For instance, the former will first
extract the specified number of principal components, then fit the
user's desired ML model, say k-NN (qeKNN
) or gradient boosting
(qeGBoost
).
The qeFOCI
function does feature selection
in a basically assumption-free manner. It handles numeric and binary
Y (the latter coded 1,0). For categorical Y, use qeFOCImult
.
The function qeFOCIrand
applies FOCI to many subsets of the
input dataset, eventually returning the union of the outputs; this is
useful if the dataset has many NA values.
Neural network models are implemented by qeNeural
and qeDeepnet
, based on keras and deepnet.
The qeLinKNN
function offers a hybrid approach. It
first fits a linear model, then applies k-Nearest Neighbors to the
residuals. The qePolyLinKNN
function does the same with a
polynomial fit.
The qeIso
function is intended mainly for use as a
smoothing method in calibration actions.
In most cases, the full basket of options in the wrapped function is not reflected. Use of arguments not presented in the qe function requires direct use of the relevant packages.
The value returned by qe*
functions depends on the algorithm, but
with some commonality, e.g. classif
, a logical value indicating
whether the problem was of classification type.
If a holdout set was requested, an additional returned component will be
testAcc
, the accuracy on the holdout set. This will be Mean
Absolute Prediction Error in the regression case, and proportion of
misclassified cases in the classification case.
The value returned by the predict
functions is an
R list with components as follows:
Classification case:
predClasses
: R factor instance of predicted class labels
probs
: vector/matrix of class probabilities; in the 2-class
case, a vector, the probabilities of Y = 1
Regression case: vector of predicted values
Norm Matloff
# see also 'details' above
## Not run:
data(peFactors)
pef <- peFactors[,c(1,3,5,7:9)]
# most people in the dataset have at least a Bachelor's degree; so let's
# just consider Master's (code 14) and PhD (code 16) as special
pef$educ <- toSubFactor(pef$educ,c('14','16'))
# predict occupation; 6 classes, 100, 101, 102, 106, 140, 141, using SVM
svmout <- qeSVM(pef,'occ',holdout=NULL)
# as example of prediction, take the 8th case, but change the gender and
# age to female and 25; note that by setting k to non-null, we are
# requesting that conditional probabilities be calculated, via
# knnCalib(), here using 25 nearest neighbors
newx <- pef[8,-3]
newx$sex <- '2'
newx$age <- 25
predict(svmout,newx,k=25)
# $predClasses
# 8
# 100
# Levels: 100 101 102 106 140 141
# $dvals
# 102/101 102/100 102/141 102/140 102/106 101/100 101/141
# 8 -0.7774038 -0.5132022 0.9997894 1.003251 0.999688 -0.4023077 1.000419
# 101/140 101/106 100/141 100/140 100/106 141/140 141/106 140/106
# 8 1.000474 0.9997371 1.000088 1.000026 1.000126 0.9460703 -0.4974625 -1.035721
#
# $probs
# 100 101 102 106 140 141
# [1,] 0.24 0.52 0.12 0.08 0 0.04
#
# so, occupation code 100 is predicted, with a 0.24 conditional
# probability
# if holdout evaluation is desired as well, say 1000 cases, seed 9999:
> svmout <- qeSVM(pef,'occ',holdout=c(1000,9999))
> svmout$testAcc
[1] 0.622 # about 62% of the holdout cases were misclassified
# linear
# lm() doesn't like numeric factor levels, so prepend an 'a'
pef$occ <- prepend('a',pef$occ)
lmout <- qeLin(pef,'occ')
predict(lmout,pef[1,-3]) # occ 100, prob 0.3316
lmout <- qeLin(pef,'wageinc')
predict(lmout,pef[1,-5]) # 70857.79
## End(Not run)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.