function (x, y = NULL, xtest = NULL, ytest = NULL, ntree = 500,
mtry = if (!is.null(y) && !is.factor(y)) max(floor(ncol(x)/3),
1) else floor(sqrt(ncol(x))), replace = TRUE, classwt = NULL,
cutoff, strata, sampsize = if (replace) nrow(x) else ceiling(0.632 *
nrow(x)), nodesize = if (!is.null(y) && !is.factor(y)) 5 else 1,
maxnodes = NULL, importance = FALSE, localImp = FALSE, nPerm = 1,
proximity, oob.prox = proximity, norm.votes = TRUE, do.trace = FALSE,
keep.forest = !is.null(y) && is.null(xtest), corr.bias = FALSE,
keep.inbag = FALSE, ...)
{
addclass <- is.null(y)
classRF <- addclass || is.factor(y)
if (!classRF && length(unique(y)) <= 5) {
warning("The response has five or fewer unique values. Are you sure you want to do regression?")
}
if (classRF && !addclass && length(unique(y)) < 2)
stop("Need at least two classes to do classification.")
n <- nrow(x)
p <- ncol(x)
if (n == 0)
stop("data (x) has 0 rows")
x.row.names <- rownames(x)
x.col.names <- if (is.null(colnames(x)))
1:ncol(x)
else colnames(x)
keep.forest <- keep.forest
testdat <- !is.null(xtest)
if (testdat) {
if (ncol(x) != ncol(xtest))
stop("x and xtest must have same number of columns")
ntest <- nrow(xtest)
xts.row.names <- rownames(xtest)
}
if (mtry < 1 || mtry > p)
warning("invalid mtry: reset to within valid range")
mtry <- max(1, min(p, round(mtry)))
if (!is.null(y)) {
if (length(y) != n)
stop("length of response must be the same as predictors")
addclass <- FALSE
}
else {
if (!addclass)
addclass <- TRUE
y <- factor(c(rep(1, n), rep(2, n)))
x <- rbind(x, x)
}
if (any(is.na(x)))
stop("NA not permitted in predictors")
if (testdat && any(is.na(xtest)))
stop("NA not permitted in xtest")
if (any(is.na(y)))
stop("NA not permitted in response")
if (!is.null(ytest) && any(is.na(ytest)))
stop("NA not permitted in ytest")
if (is.data.frame(x)) {
xlevels <- lapply(x, mylevels)
ncat <- sapply(xlevels, length)
ncat <- ifelse(sapply(x, is.ordered), 1, ncat)
x <- data.matrix(x)
if (testdat) {
if (!is.data.frame(xtest))
stop("xtest must be data frame if x is")
xfactor <- which(sapply(xtest, is.factor))
if (length(xfactor) > 0) {
for (i in xfactor) {
if (any(!levels(xtest[[i]]) %in% xlevels[[i]]))
stop("New factor levels in xtest not present in x")
xtest[[i]] <- factor(xlevels[[i]][match(xtest[[i]],
xlevels[[i]])], levels = xlevels[[i]])
}
}
xtest <- data.matrix(xtest)
}
}
else {
ncat <- rep(1, p)
names(ncat) <- colnames(x)
xlevels <- as.list(rep(0, p))
}
maxcat <- max(ncat)
if (maxcat > 53)
stop("Can not handle categorical predictors with more than 53 categories.")
if (classRF) {
nclass <- length(levels(y))
if (any(table(y) == 0))
stop("Can't have empty classes in y.")
if (!is.null(ytest)) {
if (!is.factor(ytest))
stop("ytest must be a factor")
if (!all(levels(y) == levels(ytest)))
stop("y and ytest must have the same levels")
}
if (missing(cutoff)) {
cutoff <- rep(1/nclass, nclass)
}
else {
if (sum(cutoff) > 1 || sum(cutoff) < 0 || !all(cutoff >
0) || length(cutoff) != nclass) {
stop("Incorrect cutoff specified.")
}
if (!is.null(names(cutoff))) {
if (!all(names(cutoff) %in% levels(y))) {
stop("Wrong name(s) for cutoff")
}
cutoff <- cutoff[levels(y)]
}
}
if (!is.null(classwt)) {
if (length(classwt) != nclass)
stop("length of classwt not equal to number of classes")
if (!is.null(names(classwt))) {
if (!all(names(classwt) %in% levels(y))) {
stop("Wrong name(s) for classwt")
}
classwt <- classwt[levels(y)]
}
if (any(classwt <= 0))
stop("classwt must be positive")
ipi <- 1
}
else {
classwt <- rep(1, nclass)
ipi <- 0
}
}
else addclass <- FALSE
if (missing(proximity))
proximity <- addclass
if (proximity) {
prox <- matrix(0, n, n)
proxts <- if (testdat)
matrix(0, ntest, ntest + n)
else double(1)
}
else {
prox <- proxts <- double(1)
}
if (localImp) {
importance <- TRUE
impmat <- matrix(0, p, n)
}
else impmat <- double(1)
if (importance) {
if (nPerm < 1)
nPerm <- as.integer(1)
else nPerm <- as.integer(nPerm)
if (classRF) {
impout <- matrix(0, p, nclass + 2)
impSD <- matrix(0, p, nclass + 1)
}
else {
impout <- matrix(0, p, 2)
impSD <- double(p)
names(impSD) <- x.col.names
}
}
else {
impout <- double(p)
impSD <- double(1)
}
nsample <- if (addclass)
2 * n
else n
Stratify <- length(sampsize) > 1
if ((!Stratify) && sampsize > nrow(x))
stop("sampsize too large")
if (Stratify && (!classRF))
stop("sampsize should be of length one")
if (classRF) {
if (Stratify) {
if (missing(strata))
strata <- y
if (!is.factor(strata))
strata <- as.factor(strata)
nsum <- sum(sampsize)
if (length(sampsize) > nlevels(strata))
stop("sampsize has too many elements.")
if (any(sampsize <= 0) || nsum == 0)
stop("Bad sampsize specification")
if (!is.null(names(sampsize))) {
sampsize <- sampsize[levels(strata)]
}
if (any(sampsize > table(strata)))
stop("sampsize can not be larger than class frequency")
}
else {
nsum <- sampsize
}
nrnodes <- 2 * trunc(nsum/nodesize) + 1
}
else {
nrnodes <- 2 * trunc(sampsize/max(1, nodesize - 4)) +
1
}
if (!is.null(maxnodes)) {
maxnodes <- 2 * maxnodes - 1
if (maxnodes > nrnodes)
warning("maxnodes exceeds its max value.")
nrnodes <- min(c(nrnodes, max(c(maxnodes, 1))))
}
x <- t(x)
storage.mode(x) <- "double"
if (testdat) {
xtest <- t(xtest)
storage.mode(xtest) <- "double"
if (is.null(ytest)) {
ytest <- labelts <- 0
}
else {
labelts <- TRUE
}
}
else {
xtest <- double(1)
ytest <- double(1)
ntest <- 1
labelts <- FALSE
}
nt <- if (keep.forest)
ntree
else 1
if (classRF) {
cwt <- classwt
threshold <- cutoff
error.test <- if (labelts)
double((nclass + 1) * ntree)
else double(1)
rfout <- .C("classRF", x = x, xdim = as.integer(c(p,
n)), y = as.integer(y), nclass = as.integer(nclass),
ncat = as.integer(ncat), maxcat = as.integer(maxcat),
sampsize = as.integer(sampsize), strata = if (Stratify) as.integer(strata) else integer(1),
Options = as.integer(c(addclass, importance, localImp,
proximity, oob.prox, do.trace, keep.forest, replace,
Stratify, keep.inbag)), ntree = as.integer(ntree),
mtry = as.integer(mtry), ipi = as.integer(ipi), classwt = as.double(cwt),
cutoff = as.double(threshold), nodesize = as.integer(nodesize),
outcl = integer(nsample), counttr = integer(nclass *
nsample), prox = prox, impout = impout, impSD = impSD,
impmat = impmat, nrnodes = as.integer(nrnodes), ndbigtree = integer(ntree),
nodestatus = integer(nt * nrnodes), bestvar = integer(nt *
nrnodes), treemap = integer(nt * 2 * nrnodes),
nodepred = integer(nt * nrnodes), xbestsplit = double(nt *
nrnodes), errtr = double((nclass + 1) * ntree),
testdat = as.integer(testdat), xts = as.double(xtest),
clts = as.integer(ytest), nts = as.integer(ntest),
countts = double(nclass * ntest), outclts = as.integer(numeric(ntest)),
labelts = as.integer(labelts), proxts = proxts, errts = error.test,
inbag = if (keep.inbag) matrix(integer(n * ntree),
n) else integer(n), DUP = FALSE, PACKAGE = "randomForest")[-1]
if (keep.forest) {
max.nodes <- max(rfout$ndbigtree)
treemap <- aperm(array(rfout$treemap, dim = c(2,
nrnodes, ntree)), c(2, 1, 3))[1:max.nodes, ,
, drop = FALSE]
}
if (!addclass) {
out.class <- factor(rfout$outcl, levels = 1:nclass,
labels = levels(y))
names(out.class) <- x.row.names
con <- table(observed = y, predicted = out.class)[levels(y),
levels(y)]
con <- cbind(con, class.error = 1 - diag(con)/rowSums(con))
}
out.votes <- t(matrix(rfout$counttr, nclass, nsample))[1:n,
]
oob.times <- rowSums(out.votes)
if (norm.votes)
out.votes <- t(apply(out.votes, 1, function(x) x/sum(x)))
dimnames(out.votes) <- list(x.row.names, levels(y))
class(out.votes) <- c(class(out.votes), "votes")
if (testdat) {
out.class.ts <- factor(rfout$outclts, levels = 1:nclass,
labels = levels(y))
names(out.class.ts) <- xts.row.names
out.votes.ts <- t(matrix(rfout$countts, nclass, ntest))
dimnames(out.votes.ts) <- list(xts.row.names, levels(y))
if (norm.votes)
out.votes.ts <- t(apply(out.votes.ts, 1, function(x) x/sum(x)))
class(out.votes.ts) <- c(class(out.votes.ts), "votes")
if (labelts) {
testcon <- table(observed = ytest, predicted = out.class.ts)[levels(y),
levels(y)]
testcon <- cbind(testcon, class.error = 1 - diag(testcon)/rowSums(testcon))
}
}
cl <- match.call()
cl[[1]] <- as.name("randomForest")
out <- list(call = cl, type = if (addclass) "unsupervised" else "classification",
predicted = if (addclass) NULL else out.class, err.rate = if (addclass) NULL else t(matrix(rfout$errtr,
nclass + 1, ntree, dimnames = list(c("OOB", levels(y)),
NULL))), confusion = if (addclass) NULL else con,
votes = out.votes, oob.times = oob.times, classes = levels(y),
importance = if (importance) matrix(rfout$impout,
p, nclass + 2, dimnames = list(x.col.names, c(levels(y),
"MeanDecreaseAccuracy", "MeanDecreaseGini"))) else matrix(rfout$impout,
ncol = 1, dimnames = list(x.col.names, "MeanDecreaseGini")),
importanceSD = if (importance) matrix(rfout$impSD,
p, nclass + 1, dimnames = list(x.col.names, c(levels(y),
"MeanDecreaseAccuracy"))) else NULL, localImportance = if (localImp) matrix(rfout$impmat,
p, n, dimnames = list(x.col.names, x.row.names)) else NULL,
proximity = if (proximity) matrix(rfout$prox, n,
n, dimnames = list(x.row.names, x.row.names)) else NULL,
ntree = ntree, mtry = mtry, forest = if (!keep.forest) NULL else {
list(ndbigtree = rfout$ndbigtree, nodestatus = matrix(rfout$nodestatus,
ncol = ntree)[1:max.nodes, , drop = FALSE],
bestvar = matrix(rfout$bestvar, ncol = ntree)[1:max.nodes,
, drop = FALSE], treemap = treemap, nodepred = matrix(rfout$nodepred,
ncol = ntree)[1:max.nodes, , drop = FALSE],
xbestsplit = matrix(rfout$xbestsplit, ncol = ntree)[1:max.nodes,
, drop = FALSE], pid = rfout$classwt, cutoff = cutoff,
ncat = ncat, maxcat = maxcat, nrnodes = max.nodes,
ntree = ntree, nclass = nclass, xlevels = xlevels)
}, y = if (addclass) NULL else y, test = if (!testdat) NULL else list(predicted = out.class.ts,
err.rate = if (labelts) t(matrix(rfout$errts,
nclass + 1, ntree, dimnames = list(c("Test",
levels(y)), NULL))) else NULL, confusion = if (labelts) testcon else NULL,
votes = out.votes.ts, proximity = if (proximity) matrix(rfout$proxts,
nrow = ntest, dimnames = list(xts.row.names,
c(xts.row.names, x.row.names))) else NULL),
inbag = if (keep.inbag) matrix(rfout$inbag, nrow = nrow(rfout$inbag),
dimnames = list(x.row.names, NULL)) else NULL)
}
else {
ymean <- mean(y)
y <- y - ymean
ytest <- ytest - ymean
rfout <- .C("regRF", x, as.double(y), as.integer(c(n,
p)), as.integer(sampsize), as.integer(nodesize),
as.integer(nrnodes), as.integer(ntree), as.integer(mtry),
as.integer(c(importance, localImp, nPerm)), as.integer(ncat),
as.integer(maxcat), as.integer(do.trace), as.integer(proximity),
as.integer(oob.prox), as.integer(corr.bias), ypred = double(n),
impout = impout, impmat = impmat, impSD = impSD,
prox = prox, ndbigtree = integer(ntree), nodestatus = matrix(integer(nrnodes *
nt), ncol = nt), leftDaughter = matrix(integer(nrnodes *
nt), ncol = nt), rightDaughter = matrix(integer(nrnodes *
nt), ncol = nt), nodepred = matrix(double(nrnodes *
nt), ncol = nt), bestvar = matrix(integer(nrnodes *
nt), ncol = nt), xbestsplit = matrix(double(nrnodes *
nt), ncol = nt), mse = double(ntree), keep = as.integer(c(keep.forest,
keep.inbag)), replace = as.integer(replace),
testdat = as.integer(testdat), xts = xtest, ntest = as.integer(ntest),
yts = as.double(ytest), labelts = as.integer(labelts),
ytestpred = double(ntest), proxts = proxts, msets = double(if (labelts) ntree else 1),
coef = double(2), oob.times = integer(n), inbag = if (keep.inbag) matrix(integer(n *
ntree), n) else integer(1), DUP = FALSE, PACKAGE = "randomForest")[c(16:28,
36:41)]
if (keep.forest) {
max.nodes <- max(rfout$ndbigtree)
rfout$nodestatus <- rfout$nodestatus[1:max.nodes,
, drop = FALSE]
rfout$bestvar <- rfout$bestvar[1:max.nodes, , drop = FALSE]
rfout$nodepred <- rfout$nodepred[1:max.nodes, , drop = FALSE] +
ymean
rfout$xbestsplit <- rfout$xbestsplit[1:max.nodes,
, drop = FALSE]
rfout$leftDaughter <- rfout$leftDaughter[1:max.nodes,
, drop = FALSE]
rfout$rightDaughter <- rfout$rightDaughter[1:max.nodes,
, drop = FALSE]
}
cl <- match.call()
cl[[1]] <- as.name("randomForest")
ypred <- rfout$ypred
if (any(rfout$oob.times < 1)) {
ypred[rfout$oob.times == 0] <- NA
}
out <- list(call = cl, type = "regression", predicted = structure(ypred +
ymean, names = x.row.names), mse = rfout$mse, rsq = 1 -
rfout$mse/(var(y) * (n - 1)/n), oob.times = rfout$oob.times,
importance = if (importance) matrix(rfout$impout,
p, 2, dimnames = list(x.col.names, c("%IncMSE",
"IncNodePurity"))) else matrix(rfout$impout,
ncol = 1, dimnames = list(x.col.names, "IncNodePurity")),
importanceSD = if (importance) rfout$impSD else NULL,
localImportance = if (localImp) matrix(rfout$impmat,
p, n, dimnames = list(x.col.names, x.row.names)) else NULL,
proximity = if (proximity) matrix(rfout$prox, n,
n, dimnames = list(x.row.names, x.row.names)) else NULL,
ntree = ntree, mtry = mtry, forest = if (keep.forest) c(rfout[c("ndbigtree",
"nodestatus", "leftDaughter", "rightDaughter",
"nodepred", "bestvar", "xbestsplit")], list(ncat = ncat),
list(nrnodes = max.nodes), list(ntree = ntree),
list(xlevels = xlevels)) else NULL, coefs = if (corr.bias) rfout$coef else NULL,
y = y + ymean, test = if (testdat) {
list(predicted = structure(rfout$ytestpred +
ymean, names = xts.row.names), mse = if (labelts) rfout$msets else NULL,
rsq = if (labelts) 1 - rfout$msets/(var(ytest) *
(n - 1)/n) else NULL, proximity = if (proximity) matrix(rfout$proxts/ntree,
nrow = ntest, dimnames = list(xts.row.names,
c(xts.row.names, x.row.names))) else NULL)
} else NULL, inbag = if (keep.inbag) matrix(rfout$inbag,
nrow(rfout$inbag), dimnames = list(x.row.names,
NULL)) else NULL)
}
class(out) <- "randomForest"
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.