Nothing
#' SQBreg
#'
#' Do prediction using sequential bagging method with tree based learning algorithm
#' @name SQBreg
#' @param data.train Training dataset
#' @param data.test Testing dataset
#' @param y Numeric response variable
#'
#' @param res Resampling size. Could not be greater than the input data size.
#' @param reps Replicates for the first bagging, default 100
#' @param cores Use multi-cores, default one core, use cores='maxcores' for full use.
#' @param FunKDE Kernel density estimate function. Use different kernel to fit, default logistic kernel.
#' @param control Use in rpart package, rpart.control to tune the control
#' @param SQBalgorithm.1 Use for the initial training. Option: CART, lm(default), knnreg, nnet, PCR.
#' @param SQBalgorithm.2 Use for the last training. Option: CART, lm(default), knnreg, nnet, PCR.
#' @param k The number of nearest neighbour used for knnreg
#' @param ncomp The number of component used for PCR
#' @param nnet.size The number of hidden layer and neuron for nnet
#' @importFrom rpart rpart
#' @importFrom rpart rpart.control
#' @importFrom parallel mclapply
#' @import stats
#' @import utils
#' @importFrom caret knnreg
#' @importFrom nnet nnet
#' @importFrom pls pcr
#' @return Given testing set input, make a regression prediction
#' @references Breiman L., Friedman J. H., Olshen R. A., and Stone, C. J. (1984)
#' \emph{Classification and Regression Trees.}
#' @references Soleymani, M. and Lee S.M.S(2014). Sequential combination of weighted and nonparametric bagging for classification. \emph{Biometrika}, 101, 2, pp. 491--498.
#' @references Efron, B. (1979). Bootstrap methods: Another lo ok at the jackknife. Ann. Statist., 7(1):1-26.
#' @examples
#' data(hills, package="MASS")
#' rt.df <- hills[sample(nrow(hills)),]
#' data.train <- rt.df[1 : (length(rt.df[, 1]) - 1), ]
#' data.test <- rt.df[-(1 : (length(rt.df[, 1]) - 1)),]
#' fit <- SQBreg(data.train, data.test, reps = 30, y = "time")
#' fit
#'
#' @export
SQBreg <- function(data.train, data.test, y, res, reps,
cores, FunKDE, control, SQBalgorithm.1, SQBalgorithm.2, k, ncomp, nnet.size){
pb <- txtProgressBar(min = 0, max = 4, style = 3) #
data.train = data.train[,c(which(colnames(data.train) != y), which(colnames(data.train) == y))]
data.test = data.test[,c(which(colnames(data.test) != y), which(colnames(data.test)==y))]
mingzi <- paste("x", 1 : length(data.train), sep = "")
mingzi[length(mingzi)] <- "y1"
colnames(data.train) <- mingzi
colnames(data.test) <- mingzi
formula <- y1 ~ .
n.predictors = ncol(data.train) - 1
maxit <- nrow(data.train) # input data size#
index <- 1 : maxit
response <- n.predictors + 1 # The column position of response
if (length(data.test) != length(data.train)){
stop("training and testing sets must have the same predictors and the same response")
}
if (missing(y))
stop("y is the response variable that must exist")
if (!is.data.frame(data.train) || !is.data.frame(data.test))
stop("'Input datasets must be data frame")
if (is.na(data.train) || is.na(data.test))
stop("NA must be removed from the data")
if (length(data.train) != length(data.test))
stop("Unequal column length")
if (missing(control)){
control = list(minsplit = 10, cp=0) # CART
}
if (missing(k)){
k = 1 # KNN
}
if (missing(nnet.size)){
nnet.size = 2
}
if (nnet.size < 2){
warning('Invalid hidden layer or neuron, system modify to 2')
nnet.size = 2
}
if (missing(ncomp) || ncomp == 0 || n.predictors == 1){
ncomp = 1
}
if (ncomp == n.predictors && n.predictors != 1){
ncomp = n.predictors - 1
}
if (missing(res)){
res = round(maxit / 2)
}
if (maxit <= res - 3){
warning('Function modifies the number of resampling that it is too close or over the training size.')
res = maxit - 4
}
if (missing(reps)){
reps = 100
}
if (missing(SQBalgorithm.1)){
SQBalgorithm.1 = "lm"
}
if (missing(SQBalgorithm.2)){
SQBalgorithm.2 = "lm"
}
RegTree <- function(formula, data.train, res, index, SQBalgorithm, control,...) {
SQBalgorithm = SQBalgorithm.1
store <- double(maxit)
for (i in index) {
j <- index[i]
subindex1 <- sample((1 : maxit)[-j], res, replace=F)
bootstrap.sample1 <- data.train[subindex1, ]
if (SQBalgorithm == "CART"){
fit1.step2.lm1 <- rpart(formula=formula, data=bootstrap.sample1, method="anova", control=control)
}
if (SQBalgorithm == "lm"){
fit1.step2.lm1 <- lm(formula=formula, data=bootstrap.sample1)
}
if (SQBalgorithm == "KNN"){
fit1.step2.lm1 <- knnreg(formula = formula, data = bootstrap.sample1, k = k)
}
if (SQBalgorithm == "nnet"){
fit1.step2.lm1 <- nnet(formula = formula, size = nnet.size, data = bootstrap.sample1, linout = T, trace = F)
}
if (SQBalgorithm == "PCR"){
fit1.step2.lm1 <- pcr(formula, data = bootstrap.sample1, scale =TRUE, validation = "CV")
}
pred.lm <- as.numeric(predict(fit1.step2.lm1, newdata=data.train[j, ], ncomp = ncomp))
store[i] <- pred.lm
}
store
}
# 1 cutoff
setTxtProgressBar(pb, 1)
if (missing(cores) || cores == 1){
cores = F
res.replicate <- replicate(reps, RegTree(formula=formula, data.train, res=res, index = index,
SQBalgorithm = SQBalgorithm.1))
}
else if (1 < cores & cores < 1 + getOption("mc.cores", parallel::detectCores())) {
res.replicate <- mclapply(1 : reps, function(itr) {
RegTree(formula=formula, data.train, res = res, index = index,
SQBalgorithm = SQBalgorithm.1)},
mc.cores = cores)
res.replicate <- matrix(unlist(res.replicate), ncol = reps)
}
else if (cores == "maxcores"){
cores = getOption("mc.cores", parallel::detectCores())
res.replicate <- mclapply(1 : reps, function(itr) {
RegTree(formula=formula, data.train, res, index=index,
SQBalgorithm = SQBalgorithm.1)},
mc.cores = cores)
res.replicate <- matrix(unlist(res.replicate), ncol = reps)
}
else if (cores > getOption("mc.cores", parallel::detectCores()) || cores < 1 || cores %% 1 != 0){
stop("The use number of cores is invalid")
}
setTxtProgressBar(pb, 2)
new.reg.lm100 <- res.replicate
setTxtProgressBar(pb, 3)
if (missing(FunKDE) || FunKDE == "gaussian")
{
FunKDE = function(new.reg.lm100, reps, SIGMA) {
if (missing(SIGMA)){
SIGMA <- 1 #
}
c <- 1 / sqrt(1 + SIGMA^2)
sigma.hat <- apply(new.reg.lm100, 1, sd)
meanMatrix <- matrix(rep(rowMeans(new.reg.lm100), reps), ncol = reps)
sdMatrix <- matrix(rep(sigma.hat , reps), ncol = reps)
Zi <- matrix(rnorm(length(sdMatrix), 0, SIGMA), ncol = reps) #
norm.generator150 <- meanMatrix + c * (new.reg.lm100 - meanMatrix +
sdMatrix * Zi)
return(norm.generator150)
}
}
else if (FunKDE == "logistic")
{
FunKDE = function(new.reg.lm100, reps, SIGMA) {
if (missing(SIGMA)){
SIGMA <- sqrt(pi ^ 2 / 3) #
}
c <- 1 / sqrt(1 + SIGMA^2)
meanMatrix <- matrix(rep(rowMeans(new.reg.lm100), reps), ncol = reps)
sdMatrix <- matrix(rep(apply(new.reg.lm100, 1, sd), reps), ncol = reps)
Zi <- matrix(rlogis(length(sdMatrix), 0, SIGMA), ncol = reps) #
norm.generator150 <- meanMatrix + c * (new.reg.lm100 - meanMatrix + sdMatrix * Zi)
return(norm.generator150)
}
}
#
else if (FunKDE == "rectangle")
{
FunKDE = function(new.reg.lm100, reps, SIGMA) {
if (missing(SIGMA)){
SIGMA <- sqrt(1/3) #
}
c <- 1 / sqrt(1 + SIGMA^2)
sigma.hat <- apply(new.reg.lm100, 1, sd)
meanMatrix <- matrix(rep(rowMeans(new.reg.lm100), reps), ncol = reps)
sdMatrix <- matrix(rep(apply(new.reg.lm100, 1, sd), reps), ncol = reps)
Zi <- matrix(runif(length(sdMatrix), -1, 1), ncol = reps) #
norm.generator150 <- meanMatrix + c * (new.reg.lm100 - meanMatrix +
sdMatrix * Zi)
return(norm.generator150)
}
}
else if (FunKDE == "normal"){
KDE100.generator150 = matrix(0, nrow = nrow(new.reg.lm100), ncol = ncol(new.reg.lm100))
FunKDE = function(new.reg.lm100, reps, SIGMA){
SIGMA = NULL
reps = reps
sigma.hat <- apply(new.reg.lm100, 1, sd)
mean.hat <- rowMeans(new.reg.lm100)
for (iteration in 1 : nrow(new.reg.lm100)){
KDE100.generator150[iteration, ] <-
rnorm(reps, mean.hat[iteration], sigma.hat[iteration])
}
return(KDE100.generator150)
}
}
else if (FunKDE == "uniform"){
KDE100.generator150 = matrix(0, nrow = nrow(new.reg.lm100), ncol = ncol(new.reg.lm100))
FunKDE = function(new.reg.lm100, reps, SIGMA) {
SIGMA = NULL
reps = reps
min.hat <- apply(new.reg.lm100, 1, min)
max.hat <- apply(new.reg.lm100, 1, max)
for (iteration in 1 : nrow(new.reg.lm100)){
KDE100.generator150[iteration,] <-
runif(reps, min.hat[iteration], max.hat[iteration])
}
return(KDE100.generator150)
}
}
else if (FunKDE == "logis"){
KDE100.generator150 = matrix(0, nrow = nrow(new.reg.lm100), ncol = ncol(new.reg.lm100))
FunKDE = function(new.reg.lm100, reps, SIGMA) {
SIGMA = NULL
reps = reps
mean.hat <- apply(new.reg.lm100, 1, mean)
scale.hat <- apply(new.reg.lm100, 1, sd)
for (iteration in 1 : nrow(new.reg.lm100)){
KDE100.generator150[iteration,] <-
rlogis(reps, mean.hat[iteration], scale.hat[iteration])
}
return(KDE100.generator150)
}
}
else {
stop("response generator invalid, should select: 'normal', 'uniform', 'logis',
'gaussian', 'rectangle', 'logistic'. ")
}
#
KDE100.generator150 <- FunKDE(new.reg.lm100, reps)
KDEtraining.newL <- data.frame(data.train[, -length(data.train), drop = FALSE], KDE100.generator150)
KDE.fit <- matrix(0, nrow = nrow(data.test) + nrow(data.train) - maxit, ncol = reps)
if(SQBalgorithm.2 == "CART"){
for (i in (n.predictors + 1) : (reps + n.predictors)) {
gongshi.KDE <- as.formula(paste("KDEtraining.newL[, i] ~",
paste(attr(terms.formula(formula, data = data.train), "term.labels"),
sep = "", collapse = "+")))
KDE.model <- rpart(formula = gongshi.KDE, data = KDEtraining.newL, method="anova")
KDE.fit[, i - n.predictors] <- predict(KDE.model, newdata=data.test)
}
}
if(SQBalgorithm.2 == "lm"){
for (i in (n.predictors + 1) : (reps + n.predictors)) {
gongshi.KDE <- as.formula(paste("KDEtraining.newL[, i] ~",
paste(attr(terms.formula(formula, data = data.train), "term.labels"),
sep = "", collapse = "+")))
KDE.model <- lm(formula = gongshi.KDE, data = KDEtraining.newL)
KDE.fit[, i - n.predictors] <- predict(KDE.model, newdata=data.test)
}
}
if(SQBalgorithm.2 == "KNN"){
for (i in (n.predictors + 1) : (reps + n.predictors)) {
gongshi.KDE <- as.formula(paste("KDEtraining.newL[, i] ~",
paste(attr(terms.formula(formula, data = data.train), "term.labels"),
sep = "", collapse = "+")))
KDE.model <- knnreg(formula = gongshi.KDE, data = KDEtraining.newL, k = k)
KDE.fit[, i - n.predictors] <- predict(KDE.model, newdata=data.test)
}
}
if(SQBalgorithm.2 == "nnet"){
for (i in (n.predictors + 1) : (reps + n.predictors)) {
gongshi.KDE <- as.formula(paste("KDEtraining.newL[, i] ~",
paste(attr(terms.formula(formula, data = data.train), "term.labels"),
sep = "", collapse = "+")))
KDE.model <- nnet(formula = gongshi.KDE, data = KDEtraining.newL, size = nnet.size, linout = T, trace = F)
KDE.fit[, i - n.predictors] <- as.numeric(predict(KDE.model, newdata=data.test))
# #
}
}
if(SQBalgorithm.2 == "PCR"){
for (i in (n.predictors + 1) : (reps + n.predictors)) {
gongshi.KDE <- as.formula(paste("KDEtraining.newL[, i] ~",
paste(attr(terms.formula(formula, data = data.train), "term.labels"),
sep = "", collapse = "+")))
KDE.model <- pcr(formula = gongshi.KDE, data = KDEtraining.newL, scale =TRUE, validation = "CV")
KDE.fit[, i - n.predictors] <- as.numeric(predict(KDE.model, newdata=data.test, ncomp = ncomp))
# #
}
}
# 3 cutoff
KDE.fit <- as.data.frame(KDE.fit)
final.prediction <- rowMeans(KDE.fit) ##################### Prediction results for testset.
# 4 cutoff
setTxtProgressBar(pb, 4)
return(final.prediction)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.