Nothing
###
# gensemble
###
setClass("gensemble",
representation(
abm="AbstractModel",
dclass="logical",
nlev="numeric",
ylevels="vector",
mods="list",
nmods="numeric",
bagmat="matrix",
oobpred="ANY", #a vector or a matrix
oobpredmat="matrix",
accmat="matrix",
test_oobpred="ANY",
test_oobpredmat="matrix",
test_accmat="matrix"
),
)
###
# print()
# pretty minimalist, needs more detail
###
print.gensemble <- function(x, ...)
{
lb <- x
if (!is.null(lb@abm)) {
cat(paste('Model Type:', lb@abm@model), '\n')
} else {
cat('gensemble: no model defined\n')
return
}
if (lb@nmods > 0) {
cat(sprintf('%d models built', length(lb@mods)), '\n')
}
}
###
# returns the proportions of each class in a given vector
###
lb.class.prop <- function(rowX, labs)
{
p <- c()
for (lev in labs) {
p <- c(p, length(rowX[rowX==lev])/length(rowX))
}
return(t(p))
}
###
# the predict() function
###
predict.gensemble <- function(object, X, type=c("prob", "class"), method=c("prob", "vote"), return.all=F, ...)
{
LB <- object
abm <- LB@abm
mods <- LB@mods
dclass <- LB@dclass
if (dclass) {
#classification
nlev <- LB@nlev
labs <- LB@ylevels
method <- match.arg(method, c("prob", "vote"))
type <- match.arg(type, c("prob", "class"))
pred <- matrix(0, nrow(X), nlev)
if (method == 'vote')
predmat <- matrix(0, nrow(X), length(mods))
} else {
#regression
pred <- rep(0, nrow(X))
if (return.all)
predmat <- matrix(0, nrow(X), length(mods))
}
for (i in 1:length(mods)) {
newpred <- ab.predict(abm, mods[[i]], X)
if (dclass && method == 'vote') {
predmat[,i] <- apply(newpred, 1, class.or.na, labels=labs)
} else {
pred <- pred + newpred
if (return.all)
predmat[,i] <- newpred
}
}
if (dclass && method == 'vote') {
#get the prop of votes for that level
pred <- apply(predmat, 1, lb.class.prop, labs=labs)
pred <- t(pred)
} else {
pred <- pred / length(mods)
}
if (dclass && type == 'class') {
return(as.factor(apply(pred, 1, class.or.na, labs)))
} else {
if (!dclass && return.all)
return(list(pred, predmat))
return(pred)
}
}
###
# linbag(model, X, Y, ...)
# todo: check weights
###
gensemble <- function(abm, X, Y, sampsize=NULL, sampsize_prop=FALSE, nmods=100, perturb_val=0.1, Xtest=NULL, Ytest=NULL, do.trace=TRUE, stepsize=10)
{
#if we were passed a test set, do some sanity checking
if (!is.null(Xtest)) {
if (nrow(Xtest) != length(Ytest))
stop("test data set size mismatch")
testset <- TRUE
} else {
testset <- FALSE
}
#if we have a factor in Y, we are doing classification and keep a different set of tracking metrics
if (is.factor(Y)) {
dclass <- TRUE
nlev <- nlevels(Y)
#per iteration summary function
#this is tied to what mk_oobpred_storage returns as well
sfunc <- cls_iter_summary
} else {
#doing regression
dclass <- FALSE
nlev <- 1
sfunc <- reg_iter_summary
}
#bagging matrix, keeps track of which obs used in which iteration
bagmat <- matrix(FALSE, nrow(X), nmods)
#list of models generated
modlist <- list(NULL)
#Out of bag prediction matrix. Used to store the predictions of the model from each iteration
#used to build the out of bag predictions
tmp <- mk_oobpred_storage(Y, nmods)
oobpredmat <- tmp[[1]]
#our current best prediction set
oobpred <- tmp[[2]]
#tracking the accuracy per iteration
accmat <- tmp[[3]]
#if we were passed a test set we will keep track of how it goes as well
if (testset) {
tmp <- mk_oobpred_storage(Ytest, nmods)
test_oobpredmat <- tmp[[1]]
test_oobpred <- tmp[[2]]
test_accmat <- tmp[[3]]
}
#generate sample sizes for each iteration, this will handle null sampsize
#and try to come up with something sensible for classification and regression
sizes <- mksampsize(Y, sampsize, sampsize_prop)
for (i in 1:nmods) {
#generate the indexes to use for this iteration based on sample size(s)
bagsamp <- bagsamp_idx(Y, sizes, rep=T)
bagmat[bagsamp, i] <- TRUE
Yout <- Y[bagsamp]
Yout <- perturb(Yout, perturb_val)
sink("modelouts.txt")
modlist[[i]] <- ab.model(abm, X[bagsamp,], Yout)
sink()
#which cols in the oob prediction matrix to update
starti <- (i - 1) * nlev + 1
endi <- starti + nlev - 1
#set the col(s) to 0
oobpredmat[,starti:endi] <- rep(NA, length(Y))
#run the new model on the out of bag data
oobpredmat[-bagsamp, starti:endi] <- ab.predict(abm, modlist[[i]], X[-bagsamp,])
#see how the model does on the test set
if (testset)
test_oobpredmat[,starti:endi] <- ab.predict(abm, modlist[[i]], Xtest)
#update oobpred with the mean values for all model results stored in oobpredmat
if (dclass) {
#get the columns of the prediction matrix for a given level
#aggregate the predictions in oobpredmat using the agg_oobpred function
#store this in oobpred
for (lev in levels(Y)) {
oobpred[,levcols(oobpred,lev)] <- apply(oobpredmat[,levcols(oobpredmat, lev)], 1, agg_oobpred)
if (testset)
test_oobpred[,levcols(test_oobpred, lev)] <- apply(test_oobpredmat[,levcols(test_oobpredmat, lev)], 1, agg_oobpred)
}
} else {
oobpred <- apply(oobpredmat, 1, agg_oobpred)
if (testset)
test_oobpred <- apply(test_oobpredmat, 1, agg_oobpred)
}
# cat(oobpredmat[,starti:endi], '\n')
# cat(oobpredmat, '\n')
# cat(oobpred, '\n')
#book keeping time
accmat[i, ] <- sfunc(oobpred, Y)
if (testset)
test_accmat[i, ] <- sfunc(test_oobpred, Ytest)
#print status
if(do.trace && i %% stepsize == 0) {
dimnm <- dimnames(accmat)
if (i == stepsize) {
cat(formatC(dimnm[[2]], format="s", width=10))
if (testset) {
testnames <- paste("test", dimnm[[2]], sep=".")
testwidth <- max(nchar(testnames))
cat(' ', formatC(testnames, format="s", width=testwidth))
}
cat('\n')
}
pvec <- formatC(as.vector(accmat[i,]), width=10, digits=4, format="f")
if (testset)
pvec <- c(pvec, formatC(as.vector(test_accmat[i,]), width=testwidth, digits=4, format="f"))
cat(pvec, "\n")
}
}
#create a new linboost object
lb <- new("gensemble", abm=abm)
if (dclass) {
lb@dclass <- TRUE
lb@nlev <- nlev
lb@ylevels <- levels(Y)
} else {
lb@dclass <- FALSE
}
lb@mods <- modlist
lb@nmods <- length(modlist)
lb@bagmat <- bagmat
lb@oobpred <- oobpred
lb@oobpredmat <- oobpredmat
lb@accmat <- accmat
if (testset) {
lb@test_oobpred <- test_oobpred
lb@test_oobpredmat <- test_oobpredmat
lb@test_accmat <- test_accmat
}
return(lb)
###
# old return, delete this before release
###
# if (testset) {
# LB <- list(modlist, bagmat, oobpred, oobpredmat, accmat, test_oobpred, test_oobpredmat, test_accmat)
# } else {
# LB <- list(modlist, bagmat, oobpred, oobpredmat, accmat)
# }
# return(LB)
}
###
# the summary information collected for each iteration of regression
# changes to this should also change mk_oobpred_storage
###
reg_iter_summary <- function(yhat, y)
{
#for regression, we want the mse, variance, mse scaled by variance
vr <- var(yhat, na.rm=T)
mse <- calc_mse(yhat, y)
scaled_mse <- mse / vr
psuedo_r2 <- cor(y, yhat, use="complete.obs")
na <- sum(is.na(yhat))/length(y)
c(vr, mse, scaled_mse, psuedo_r2, na)
}
###
# the summary information collected for each iteration of classification
# changes to this should also change mk_oobpred_storage
###
cls_iter_summary <- function(yhat, y)
{
#XXX TODO: fix these names, should be class.hit etc
s <- vector("numeric", length=0)
labs <- apply(yhat, 1, class.or.na, labels=levels(y))
#per class accuracy
for (lev in levels(y)) {
s <- c(s,pred_error(y, labs, lev)["hit"])
}
#proportion that have not yet had classification attempted
s <- c(s, sum(is.na(labs)) / length(labs))
#total accuracy
s <- c(s, pred_error(y, labs)["hit"])
return(s)
}
###
# creates storage for the internal operations of linboost
# oobpredmat keeps track of what was predicted for each iteration
# oobpred is the running "best" prediction updated every iteration
# accmat stores information about accuracy of each iteration
#
# the specific columns differ for regression and classfication, so
# are generated by reg_iter_summary and cls_iter_summary respectively
###
mk_oobpred_storage <- function(Y, nmods)
{
if (!is.factor(Y)) {
oobpredmat <- matrix(NA, length(Y), nmods)
oobpred <- rep(0, length(Y))
accmat <- matrix(0, nmods, 5)
colnames(accmat) <- c("var", "mse", "scaled.mse", "pseudo R^2", "obs.na")
} else {
#track accuracy over iterations, +2 to track unlabelled and overall accuracy
nlev <- length(levels(Y))
#for classification, we store a column for each level defined
oobpredmat <- matrix(NA, length(Y), nlev * nmods)
colnames(oobpredmat) <- rep(levels(Y), nmods)
#the out of bag prediction
oobpred <- matrix(0, length(Y), nlev)
colnames(oobpred) <- levels(Y)
accmat <- matrix(0, nmods, nlev+2)
colnames(accmat) <- c(levels(Y), "na", "total")
}
return(list(oobpredmat, oobpred, accmat))
}
###
# come up with some meaningful intepretation of the sample size
# provided to linboost. this is used when creating the bagged sample.
# if proporption is TRUE, all sample sizes provided are assumed to be
# proportions and should range between 0 and 1
#
# for regression if the sample size is null, it is assumed to be the length
# of the input data.
#
# if the given Y value is not a factor, regression is assumed, both here and
# in linboost
#
# for classification, we want a list keyed by each level with the value being
# the sample size for that class.
#
# if sampsize is null, it will build a list of the levels with the count for
# each level
#
# if it is a list, and proportion is false, it will check the provided levels
# match what is available in Y, and return the sample values passed in with no
# modification
#
# if it is a list, and proportion is true, it will set the value for each level
# to be the proportion specifed. So if Level A has n 100 and sampsize[["A"]] is 0.7
# the returned list will be sampesize[["A"]] = 70
#
# if it is a vector, it must either have 1 element or an element for each level.
#
# if there is 1 element, it is assumed to be the sample size for each level.
#
# If proportion is TRUE, the sample size will be generated based on the count
# of each level in Y. E.g. if Y has 100 of level A and 50 of level B, sampsize
# is 0.7 and proportion is TRUE, the output would be list(A=70, B=35)
#
# If there is 1 element and proportion is false, it will be the sample size for
# each level.
#
# If it is a vector of length greater than 1, it is assumed each item corresponds
# the the order of levels in Y as returned by levels(). If you want to guarantee
# the level names match the right size, pass a list of the level names and the
# sizes you want. (see above for details on this)
#
# note the actual sampling is done by bagsamp_idx, which requires a valid
# sampsize argument. use this function to generate the valid sampsize argument
mksampsize <- function(Y, sampsize=NULL, proportion=FALSE)
{
#for regression it is either specified or we use the length of the input
if (!is.factor(Y)) {
if (is.null(sampsize)) {
sz <-round(length(Y) * 0.8)
} else {
if (proportion) {
sz <- round(length(Y) * sampsize)
} else {
sz <- sampsize
}
}
return(sz)
}
#for classification, we use a list with a sample size for each class
sz <- list()
if (is.null(sampsize)) {
#no sample size given so use the counts per level
for (lev in levels(Y)) {
sz[[lev]] <- round(length(which(Y == lev) * 0.8))
}
} else if (is.list(sampsize)) {
#a list of sample sizes per class
#check the names match
for (nm in names(sampsize)) {
if (! nm %in% levels(Y)) {
stop("unknown level name in sampsize")
}
}
if (proportion) {
for (lev in levels(Y)) {
sz[[lev]] <- round(length(which(Y == lev)) * sampsize[lev])
}
} else {
sz <- sampsize
}
} else if (is.vector(sampsize)) {
#a vector of sample sizes given, assume it matches the order of levels(Y)
#if we were passed a single value, assume it is the same value for each level
if (length(sampsize) == 1) {
sampsize <- rep(sampsize, length(levels(Y)))
}
if (length(sampsize) != length(levels(Y))) {
stop("sample size vector does not match number of levels")
}
for (i in 1:length(sampsize)) {
lev <- levels(Y)[i]
if (proportion) {
sz[[lev]] <- round(length(which(Y == lev)) * sampsize[i])
} else {
sz[[lev]] <- sampsize[i]
}
}
} else {
stop("dont know what to do with sampsize")
}
return(sz)
}
###
# sample the given Y values based on the sampsize argument
#
# for regression, sampsize should be the length of the sample required.
#
# for classification, it should be a list keyed by each class in Y with
# a corresponding value that is the desired sample size for that class.
#
# see mksampsize for details the sampsize argument
#
# XXX todo weights.
###
bagsamp_idx <- function(Y, sampsize, weights=NULL, rep=T)
{
#assumes sampsize has been set up correctly via mksampsize
if (!is.factor(Y)) {
return(sample(1:length(Y), sampsize, replace=rep, prob=weights))
}
#need to check how weights work, probably a weight per class
bagsamp <- c()
for (lev in levels(Y)) {
idx <- which(Y == lev)
sz <- sampsize[[lev]]
bagsamp <- c(bagsamp, sample(idx, sz, replace=rep, prob=weights))
}
return(bagsamp)
}
###
# perturb the inputs
###
perturb <- function(y, prop=0) {
if (prop == 0L){
#noop
return(y)
} else if (prop > 1) {
stop("perturb proportion should be between 0 and 1")
}
#how many items in y to perturb
plen <- round(length(y) * prop)
#inidices to perturb
pvec <- sample(1:length(y), plen)
#do the peturb
y[pvec] <- y[pvec][order(rnorm(plen))]
return(y)
}
###
# calculate the prediction error for classification
# yt is the true y values, yhat are the predicted values
# pass level to check for a given level accuracy only.
##
# xxx swap yt and yhat args to be consistent w/other funcs
###
pred_error <- function(yt, yhat, level=NULL) {
if (is.null(level)) {
#look at the whole set
y.idx <- 1:length(yt)
} else {
#look at a specific level
y.idx <- which(yt == level)
}
if (length(y.idx) == 0) {
return(NA)
}
# hit = proportion of correct classification
hit <- sum(na.omit(yhat[y.idx] == yt[y.idx])) / length(na.omit(yhat[y.idx]))
# miss = proportion of incorrect classification
miss <- sum(na.omit(yhat[y.idx] != yt[y.idx])) / length(na.omit(yhat[y.idx]))
#stopifnot(miss == 1 - hit)
c(hit=hit, miss=miss)
}
###
# calculate the mean squared error
###
calc_mse <- function(yhat, y) {
return(sum((yhat - y)^2, na.rm=T)/ length(y))
}
###
# return the most likely class based on the x values
# returns NA if NaN's exist in X
###
class.or.na <- function(X, labels) {
if (is.nan(sum(X))) {
return(NA)
}
return(labels[which.max(X)])
}
###
#take the mean of all values, skipping NA's
###
agg_oobpred <- function(X) {
mean(X, na.rm=T)
}
###
# return the column indexes that match the given level. used when
# aggregating prediction summaries.
###
levcols <- function(mat, level) {
which(colnames(mat) == level)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.