packrat/lib-R/x86_64-w64-mingw32/3.6.1/rpart/tests/testall.R

# 
# This is a set of tests, with printout, that tries
#  to exercise all of the splitting rules
# It's less formal than the others in this directory, but covers
#  more.
# For all of the cross-validations I set xgroups explicitly, so as
#  to avoid any changes in random number allocation of the groups.

library(rpart)
require(survival)
options(digits=4)  #avoid trivial rounding changes across R versions
set.seed(10)
# 
#
# Survival, using the stagec data
#
#   Time to progression in years
#   status  1=progressed, 0= censored
#   age
#   early endocrine therapy   1=no 2=yes
#   % of cells in g2 phase, from flow cytometry
#   tumor grade (Farrow) 1,2,3,4
#   Gleason score (competing grading system)
#   ploidy
xgroup <- rep(1:10, length=nrow(stagec))
fit1 <- rpart(Surv(pgtime, pgstat) ~ age + eet + g2+grade+gleason +ploidy,
		stagec, method="poisson",
              control=rpart.control(usesurrogate=0, cp=0, xval=xgroup))
fit1
summary(fit1)

fit2 <- rpart(Surv(pgtime, pgstat) ~ age + eet + g2+grade+gleason +ploidy,
		stagec, xval=xgroup)
fit2
summary(fit2)


#
# Continuous y variable
#  Use deterministic xgroups: the first group is the 1st, 11th, 21st, etc
#  smallest obs, the second is 2, 12, 22, ...

mystate <- data.frame(state.x77, region=factor(state.region))
names(mystate) <- c("population","income" , "illiteracy","life" ,
       "murder", "hs.grad", "frost",     "area",      "region")

xvals <- 1:nrow(mystate)
xvals[order(mystate$income)] <- rep(1:10, length=nrow(mystate))

fit4 <- rpart(income ~ population + region + illiteracy +life + murder +
			hs.grad + frost , mystate,
		   control=rpart.control(minsplit=10, xval=xvals))

summary(fit4)


#
# Check out xpred.rpart
#
meany <- mean(mystate$income)
xpr <- xpred.rpart(fit4, xval=xvals)
xpr2 <- (xpr - mystate$income)^2
risk0 <- mean((mystate$income - meany)^2)
xpmean <- as.vector(apply(xpr2, 2, mean))   #kill the names
all.equal(xpmean/risk0, as.vector(fit4$cptable[,'xerror']))

xpstd <- as.vector(apply((sweep(xpr2, 2, xpmean))^2, 2, sum))
xpstd <- sqrt(xpstd)/(50*risk0)
all.equal(xpstd, as.vector(fit4$cptable[,'xstd']))

#
# recreate subset #3 of the xval
#
tfit4 <- rpart(income ~ population + region + illiteracy +life + murder +
			hs.grad + frost , mystate,  subset=(xvals!=3),
		   control=rpart.control(minsplit=10, xval=0))
tpred <- predict(tfit4, mystate[xvals==3,])
all.equal(tpred, xpr[xvals==3,ncol(xpr)])

# How much does this differ from the "real" formula, more complex,
#   found on page 309 of Breiman et al. ?
#xtemp <- (xpr2/outer(rep(1,50),xpmean)) -  ((mystate$income - meany)^2)/risk0
#real.se<- xpmean* sqrt(apply(xtemp^2,2,sum))/(risk0*50)


# Simple yes/no classification model
fit5 <- rpart(factor(pgstat) ~  age + eet + g2+grade+gleason +ploidy,
	  stagec, xval=xgroup)

fit5

fit6 <- rpart(factor(pgstat) ~  age + eet + g2+grade+gleason +ploidy,
		stagec, parm=list(prior=c(.5,.5)), xval=xgroup)
summary(fit6)
#
# Fit a classification model to the car data.
#  Now, since Reliability is an ordered category, this model doesn't
# make a lot of statistical sense, but it does test out some
# areas of the code that nothing else does
#
xcar <- rep(1:8, length=nrow(cu.summary))
carfit <- rpart(Reliability ~ Price + Country + Mileage + Type,
		   method='class', data=cu.summary, xval=xcar)

summary(carfit)
jmcascalheira/LGMIberiaCluster documentation built on June 8, 2021, 10 a.m.