# use these to preprocess categorical predictors for knn
code.factor <- function(x,y) {
if(!is.factor(x)) return(x)
if(is.factor(y)) {
tab = table(x,y)
p = tab[,2]/(tab[,1]+tab[,2])
} else {
p = as.vector(as.matrix(prototypes(y,x)))
#p = p - mean(p)
names(p) = levels(x)
}
print(round(sort(p),2))
p[x]
}
code.factors <- function(x,type=c("indicator","effect")) {
# returns a new data frame where all categorical predictors are numerically
# coded.
resp = response.var(x)
type = match.arg(type)
if(type == "indicator") {
# ordered factors are made integer, others made indicator
x = apply.df(x,function(s) if(is.ordered(s)) as.numeric(s) else s)
m = model.matrix(terms(x),x)[,-1,drop=F]
cbind(as.data.frame(m),x[resp])
} else {
y = x[[resp]]
pred = predictor.vars(x)
for(k in pred) {
if(is.factor(x[[k]])) cat(k,":\n")
x[[k]] = code.factor(x[[k]],y)
}
x
}
}
set.sublist <- function(lst,i,v) {
if(i == 1) c(v,lst[-1])
else if(i == length(lst)) c(lst[-i],v)
else c(lst[1:(i-1)],v,lst[(i+1):length(lst)])
}
make.ordered.factors <- function(x,exclude=NULL) {
# convert unordered factors in x with >2 levels into multiple
# binary factors.
is.unordered = function(f){is.factor(f) && !is.ordered(f) && nlevels(f)>2}
vs = names(which(sapply(x,is.unordered)))
vs = setdiff(vs,exclude)
x = as.list(x)
for(v in vs) {
y = indicators.factor(x[[v]])[-1,,drop=F]
rownames(y) = paste(v,rownames(y),sep="")
y = as.list(as.data.frame(t(y)))
y = lapply(y,factor.logical)
i = which(names(x) == v)
x = set.sublist(x,i,y)
}
as.data.frame(x)
}
#' Convert data frame columns to numeric
#'
#' Factors are also converted to numeric.
#' @param data data.frame
#' @param exclude columns to exclude
#' @param warn allow warnings
#'
#' @export
#' @examples
#' make_numeric_data_frame(iris)
make_numeric_data_frame <- function(data, exclude=NULL, warn=FALSE) {
i <- !sapply(data, is.numeric)
i[exclude] <- FALSE
if(any(i)) {
if(warn) warning("converting factors to numeric")
data[i] <- apply.df(data[i], as.numeric)
}
data
}
knn.model <- function(formula,data=NULL,k=1) {
# works for Splus too
library(class)
if(is.null(data)) data <- formula
else data <- model.frame(formula,data)
pred <- predictor.vars(data)
data[pred] = make_numeric_data_frame(data[pred],warn=T)
object <- list(terms=terms(data),data=data,k=k)
class(object) <- "knn"
object$scale <- sd(data[pred])
object$scale[object$scale == 0] = 1
resp <- response.var(data)
p <- table(data[[resp]])
object$tie.breaker <- levels(data[[resp]])[which.max(p)]
object
}
print.knn <- function(object) {
cat("nearest neighbor classifier, k =",format(object$k),"\n")
pred <- predictor.vars(object)
cat(nrow(object$data), "examples in", length(pred),"dimensions\n")
}
model.frame.knn <- function(object) {
object$data
}
predict.knn <- function(object,test,k,type=c("class","vector","response"),...) {
train <- object$data
resp <- response.var(object)
pred <- predictor.vars(object)
test[pred] = make_numeric_data_frame(test[pred])
s <- object$scale
# subtracting mean is irrelevant
x <- scale(train[pred],center=F,scale=s)
xt <- scale(test[pred],center=F,scale=s)
if(missing(k)) k <- object$k
type <- match.arg(type)
r <- knn(x,xt,train[[resp]],k=k,l=floor(k/2+1),prob=(type!="class"))
r[is.na(r)] <- object$tie.breaker
if(type == "vector") {
p <- attr(r,"prob")
lev <- levels(train[[resp]])
v <- array(0,c(length(r),length(lev)),list(rownames(test),lev))
for(j in 1:length(lev)) {
i <- (r == lev[j])
v[i,j] <- p[i]
v[i,-j] <- 1-p[i]
}
r <- v
} else if(type == "response") {
p = attr(r,"prob")
lev <- levels(train[[resp]])
v = p
i = which(r == lev[1])
v[i] = 1-p[i]
names(v) = rownames(test)
r = v
}
r
}
misclass.knn <- function(object,data,rate=F) {
if(missing(data)) data <- model.frame(object)
resp <- response.var(object)
r = sum(predict(object,data) != data[[resp]])
if(rate) r/nrow(data) else r
}
deviance.knn <- function(object,data,rate=F) {
if(missing(data)) data <- model.frame(object)
resp <- response.var(object)
y = data[[resp]]
p = predict(object,data,type="vector")
dev = 0
for(lev in levels(y)) {
dev = dev + sum(log(p[y==lev,lev]))
}
dev = -2*dev
if(rate) dev/nrow(data) else dev
}
# should use deviance, not misclass
best.k.knn <- function(object,ks=1:50) {
# find k with best cross-validation performance
# uses leave-one-out only
train <- object$data
resp <- response.var(object)
pred <- predictor.vars(object)
s <- object$scale
x <- scale(train[pred],center=F,scale=s)
max.k = max(table(train[[resp]]))
ks = ks[ks <= max.k]
r <- c()
for(i in 1:length(ks)) {
y <- knn.cv(x,train[[resp]],k=ks[i],l=floor(ks[i]/2+1))
y[is.na(y)] <- object$tie.breaker
r[i] <- sum(y != train[[resp]])
}
plot(ks,r,xlab="number of neighbors (k)",ylab="misclass",type="o")
title("Leave-one-out cross-validation")
i <- min(which(r==min(r)))
cat("best k is",ks[i],"\n")
object$k <- ks[i]
object
}
test.k.knn <- function(object,test,ks=1:50) {
# find k with fewest test errors
train <- object$data
resp <- response.var(object)
pred <- predictor.vars(object)
test[pred] = make_numeric_data_frame(test[pred])
s <- object$scale
x <- scale(train[pred],center=F,scale=s)
xt = scale(test[pred],center=F,scale=s)
max.k = max(table(train[[resp]]))
ks = ks[ks <= max.k]
r = c()
for(i in 1:length(ks)) {
y = knn(x,xt,train[[resp]],k=ks[i],l=floor(ks[i]/2+1))
y[is.na(y)] <- object$tie.breaker
r[i] = sum(y != test[[resp]])
}
plot(ks,r,xlab="k",ylab="misclass",type="o")
i <- which.min(r)
cat("best k on test is",ks[i],"\n")
object$k <- ks[i]
object
names(r) = ks
invisible(r)
}
# score predictors via leave-one-out
# uses sort.data.frame
drop1.knn <- function(object) {
train <- object$data
resp <- response.var(object)
pred <- predictor.vars(object)
s <- object$scale
x <- scale(train[pred],center=F,scale=s)
x <- data.frame(x)
yt <- train[[resp]]
# score original model
y <- knn.cv(x,yt,k=object$k,l=floor(object$k/2+1))
y[is.na(y)] <- object$tie.breaker
e <- sum(y != yt)
res <- data.frame(k=object$k,misclass=e,row.names="<none>")
# score each reduced model
ks = 1:20
for(i in 1:length(pred)) {
e = c()
for(j in 1:length(ks)) {
k = ks[j]
y <- knn.cv(not(x,pred[i]),yt,k=k,l=floor(k/2+1))
y[is.na(y)] <- object$tie.breaker
e[j] <- sum(y != yt)
}
k = ks[which.min(e)]
e = min(e)
res <- rbind(res, data.frame(k=k,misclass=e,row.names=paste("-",pred[i])))
}
sort.data.frame(res)
}
# remove predictors, using drop1.knn
step.knn <- function(object) {
resp <- response.var(object)
repeat {
a <- drop1.knn(object)
i <- which(a$misclass == min(a$misclass))
i <- i[rownames(a)[i] != "<none>"]
if(length(i) == 0) {
print(a)
break
}
i = i[1]
pred <- predictor.vars(object)
omit <- substring(rownames(a)[i], 3)
cat("dropping",omit,"\n")
pred <- pred[!(pred %in% omit)]
object$data <- not(object$data, omit)
object$scale <- not(object$scale, omit)
object$k = a$k[i]
object$terms <- terms(formula(paste(resp,"~",paste(pred,collapse="+"))))
}
object
}
reduce.knn <- function(object) {
train <- object$data
resp <- response.var(object)
pred <- predictor.vars(object)
s <- object$scale
x <- scale(train[pred],center=F,scale=s)
y <- train[[resp]]
if(object$k == 1) {
if(F) keep <- condense(x,y,trace=F)
else keep <- 1:nrow(x)
keep <- reduce.nn(x,keep,y)
} else {
keep <- multiedit(x,y,object$k)
}
object$data <- object$data[keep,]
if(object$k > nrow(object$data)) {
object$k <- nrow(object$data)-1
cat("changing k to",object$k,"\n")
}
object
}
proto.knn <- function(object,np=1) {
# reduce the examples to np cluster centers in each class
train <- object$data
resp <- response.var(object)
pred <- predictor.vars(object)
s <- object$scale
x <- scale(train[pred],center=F,scale=s)
y <- train[[resp]]
nc = nlevels(y)
if(length(np) == 1) {
np = rep(np,nc)
if(F) {
p = table(y)
p = p/min(p)
np = round(np*p)
print(np)
}
}
if(is.null(names(np))) names(np) = levels(y)
r = list()
for(lev in levels(y)) {
x.lev = x[y==lev,]
if(np[[lev]] == 1) {
r[[lev]] = as.data.frame.row(mean.data.frame(x.lev),"1")
} else if(np[[lev]] == nrow(x.lev)) {
r[[lev]] = x.lev
} else {
cl = kmeans(x.lev,np[[lev]])
r[[lev]] = as.data.frame(cl$centers)
}
rownames(r[[lev]]) = paste(lev,rownames(r[[lev]]),sep=".")
}
x = scale(rbind(r[[1]],r[[2]]),center=F,scale=1/s)
y = rep(levels(y),times=sapply(r,nrow))
object$data = cbind(y,x)
names(object$data)[1] = resp
object
}
gcl.model <- function(formula,data,equal.mean=F,
type=c("full","diag","sphere")) {
# Gaussian classifier
type = match.arg(type)
if(is.null(data)) data <- formula
else data <- model.frame(formula,data)
pred = predictor.vars(data)
data[pred] = make_numeric_data_frame(data[pred],warn=T)
y = data[[response.var(data)]]
m = v = p = bias = list()
for(lev in levels(y)) {
x = data[y==lev,pred,drop=F]
m[[lev]] = if(equal.mean) mean(data[pred]) else mean(x)
if(type == "full") {
v[[lev]] = t(chol(ml.cov(x)))
} else if(type == "sphere") {
v[[lev]] = diag(ncol(x))*sqrt(mean(diag(cov(x))))
} else {
v[[lev]] = diag(sqrt(diag(cov(x))))
}
p[[lev]] = mean(y==lev)
bias[[lev]] = -2*log(p[[lev]]) - sum(log(diag(v[[lev]])))
}
object = list(m=m,v=v,p=p,bias=bias,terms=terms(formula),call=match.call())
class(object) = "gcl"
object
}
predict.gcl <- function(object,test,type=c("class","response"),se=F) {
pred <- predictor.vars(object)
test[pred] = make_numeric_data_frame(test[pred])
classes = names(object$m)
e = array(0,c(nrow(test),length(classes)),list(rownames(test),classes))
for(lev in classes) {
xt = test[pred] - rep.row(object$m[[lev]],nrow(test[pred]))
xt = forwardsolve(object$v[[lev]],t(xt))
e[,lev] = t(colSums(xt*xt)) + object$bias[[lev]]
}
type = match.arg(type)
if(type == "class") classes[apply(e,1,which.min)]
else if(type == "response") 1/(1+exp(-0.5*(e[,1]-e[,2])))
}
misclass.gcl = misclass.knn
# Useful functions for making maps
locate.bbox <- function() {
# use the mouse to pick a bounding box
coord = locator(2,type="p",pch="+",col="red",cex=3)
bbox = c(sort(coord$x),sort(coord$y))
rect(bbox[1],bbox[3],bbox[2],bbox[4],col=NA,border="red")
bbox
}
# convert census COB file into map object
# if centers=T, returns a data frame with columns x and y
# note: names may have duplicates (multi-polygon tracts)
read.cob <- function(pfile,pafile=NULL,centers=F) {
s <- scan(pfile,"",quiet=T)
# remove last two ENDs
s <- s[-length(s)]
s <- s[-length(s)]
breaks <- which(s == "END")
index <- s[c(1,breaks+1)]
if(is.null(pafile)) {
nam <- index
} else {
spa <- scan(pafile,"",quiet=T)
nam <- spa[seq(2,length(spa),by=2)]
names(nam) <- spa[seq(1,length(spa),by=2)]
# order names to match the p file
# tracts with bogus index will have name "NA"
nam <- nam[index]
if(centers) {
# make names unique
dup <- which(duplicated(nam))
for(i in dup) {
if(!is.na(nam[i])) {
j <- which(nam == nam[i])
nam[j] <- paste(nam[j],as.character(1:length(j)),sep=":")
}
}
}
}
# remove internal names
names(nam) = NULL
if(centers) {
# keep only first two coordinates of each block
x <- as.numeric(s[c(2,breaks+2)])
y <- as.numeric(s[c(3,breaks+3)])
i <- !is.na(nam)
data.frame(x=x[i],y=y[i],row.names=nam[i])
} else {
# change END to NA,NA
s[c(breaks,breaks+1)] <- "NA"
# remove first two coordinates of each block
i <- rep(T,length(s))
i[1:3] <- F
i[c(breaks+2, breaks+3)] <- F
s <- as.numeric(s[i])
list(x=s[seq(1,length(s),by=2)],y=s[seq(2,length(s),by=2)],names=nam)
}
}
# convert a FIPS code into a comma-separated region name
fips.split = function(x) {
sapply(x, function(s) paste(substring(s,1,5),substring(s,6),sep=","))
}
# choropleth map
map.vector <- function(m,x,main="",nlevels=NULL,key=T,warn=T,
col=color.palette,color.palette=YlGnBu.colors,
bg=gray(0.5),breaks,mar,...) {
if(is.data.frame(x)) {
df <- x; x <- df[[1]]; names(x) <- rownames(df)
if(missing(main)) main <- colnames(df)[1]
}
if(!warn && length(setdiff(names(x),m$names)) > 0) {
warning("some regions have data but do not appear on the map")
}
#print(setdiff(m$names,names(x)))
j <- match.map(m,names(x),warn=warn)
x <- as.numeric(x)
# determine breaks,nlevels
if(missing(breaks)) {
if(is.null(nlevels)) {
if(is.function(col)) nlevels = 5
else nlevels = length(col)
}
breaks <- break.quantile(x,nlevels,pretty=T)
} else nlevels = length(breaks)-1
# breaks,nlevels are now known
if(is.function(col)) col = col(nlevels)
else if(nlevels != length(col)) warning("some colors not used")
a <- cut(x,breaks,include=T)
if(missing(mar)) {
mar = c(0,0,0,0.1)
mar[3] = if(key) 2 else 1
}
opar = par(mar=mar)
on.exit(par(opar))
map(m,fill=T,color=col[as.numeric(a)[j]],res=0,bg=bg,mar=mar,...)
if(key) {
if(nlevels > 50) color.key(col)
else color.key(col,breaks=breaks)
title(main,line=1)
}
else title(main)
invisible(breaks)
}
#' @export
mosaicplot <- function(x, ...) UseMethod("mosaicplot")
### Changes by MM:
## - NULL instead of NA for default arguments, etc [R / S convention]
## - plotting at end; cosmetic
## - mosaic.cell():
### Changes by KH:
## Shading of boxes to visualize deviations from independence by
## displaying sign and magnitude of the standardized residuals.
### Changes by minka:
## Correct label placement and margins.
# Changes for Splus:
# mosaic.cell must be external, needs shade,rev,rot arguments.
# main cannot be NULL
# adj cannot be a vector
# draw border of polygon separately, to get line style.
#' @export
mosaicplot.default <-
function(X, main = "", xlab = NULL, ylab = NULL, sort = NULL, space = NULL,
dir = NULL, color = FALSE, shade = FALSE, margin = NULL,
type = c("pearson", "deviance", "FT"), rev.y = F, cex=1,
rotate = F, ...)
{
# allocate space for the first column of X, then recurse.
# relies on (1,1000) user coordinates.
# X is a matrix containing indices and data.
# space[1] is the percentage spacing.
# maxdim[1] is the cardinality.
# label[[1]] are the values.
# (x1,y1)(x2,y2) are the corners of the plotting region.
# label.x is the position of row labels
# label.y is the position of column labels
mosaic.cell <- function(X, x1, y1, x2, y2,
space, dir, color, label.x, label.y,
maxdim, currlev, label, shade)
{
# p is the column containing counts
p <- ncol(X) - 2
if (dir[1] == "v") { # split here on the X-axis.
xdim <- maxdim[1]
# XP are the column widths
XP <- rep(0, xdim)
for (i in 1:xdim) {
XP[i] <- sum(X[X[,1]==i,p]) / sum(X[,p])
}
white <- space[1] * (x2 - x1) / max(1, xdim-1)
# (x.l,x.r)[i] is the drawing region for column i
x.l <- x1
x.r <- x1 + (1 - space[1]) * XP[1] * (x2 - x1)
if (xdim > 1) {
for (i in 2:xdim) {
x.l <- c(x.l, x.r[i-1] + white)
x.r <- c(x.r, x.r[i-1] + white +
(1 - space[1]) * XP[i] * (x2 - x1))
}
}
if (label.y > 0) {
this.lab <-
if (is.null(label[[1]][1])) {
paste(rep(as.character(currlev),
length(currlev)),
as.character(1:xdim), sep=".")
} else label[[1]]
# minka
text(x= x.l + (x.r - x.l) / 2,
y= label.y,
srt=0, adj=c(.5,1), cex=cex, this.lab)
h <- max(strheight(this.lab,cex=cex))
label.y <- label.y - h
}
if (p > 2) { # recursive call.
for (i in 1:xdim) {
if (XP[i] > 0) {
mosaic.cell(as.matrix(X[X[,1]==i, -1]),
x.l[i], y1, x.r[i], y2,
space[-1],dir[-1],
color, (i==1)*label.x, label.y,
maxdim[-1],
currlev+1, label[-1], shade)
} else {
segments(rep(x.l[i],3), y1+(y2-y1)*c(0,2,4)/5,
rep(x.l[i],3), y1+(y2-y1)*c(1,3,5)/5)
}
}
} else { # ncol(X) <= 1 : final split polygon and segments.
for (i in 1:xdim) {
if (XP[i] > 0) {
polygon(c(x.l[i], x.r[i], x.r[i], x.l[i]),
c(y1, y1, y2, y2),
lty = if(shade) X[i, p+1] else 1,
col = if(shade) {
color[X[i, p+2]]
} else color[i])
if(F) {
segments(c(rep(x.l[i],3),x.r[i]),
c(y1,y1,y2,y2),
c(x.r[i],x.l[i],x.r[i],x.r[i]),
c(y1,y2,y2,y1))
}
} else {
segments(rep(x.l[i],3), y1+(y2-y1)*c(0,2,4)/5,
rep(x.l[i],3), y1+(y2-y1)*c(1,3,5)/5)
}
}
}
} else { ## dir[1] - "horizontal" : split here on the Y-axis.
ydim <- maxdim[1]
# YP are the row heights
YP <- rep(0, ydim)
for (j in 1:ydim) {
YP[j] <- sum(X[X[,1]==j,p]) / sum(X[,p])
}
if(rev.y) YP <- rev(YP)
white <- space[1] * (y2 - y1) / max(1, ydim - 1)
# (y.b,y.t)[i] is the drawing region for row i
y.b <- y2 - (1 - space[1]) * YP[1] * (y2 - y1)
y.t <- y2
if (ydim > 1) {
for (j in 2:ydim) {
y.b <- c(y.b, y.b[j-1] - white -
(1 - space[1]) * YP[j] * (y2 - y1))
y.t <- c(y.t, y.b[j-1] - white)
}
}
if (label.x > 0) {
this.lab <-
if (is.null(label[[1]][1])) {
paste(rep(as.character(currlev),
length(currlev)),
as.character(1:ydim), sep=".")
} else label[[1]]
# minka
if(rev.y) this.lab <- rev(this.lab)
if(rotate) {
text(x= label.x,
y= y.b + (y.t - y.b) / 2,
srt=90, adj=c(.5,1), cex=cex, this.lab)
h <- max(strheight(this.lab,cex=cex,units="inch"))
h = h/par("pin")[1]*diff(par("usr")[1:2])
label.x <- label.x + h
} else {
w <- max(strwidth(this.lab,cex=cex))
label.x <- label.x + w
text(x= label.x,
y= y.b + (y.t - y.b) / 2,
srt=0, adj=1, cex=cex, this.lab)
}
#if(label.x > left.limit) label.x <- 0
}
if (p > 2) { # recursive call.
for (j in 1:ydim) {
if (YP[j] > 0) {
mosaic.cell(as.matrix(X[X[,1]==j,2:(p+2)]),
x1, y.b[j], x2, y.t[j],
space[-1], dir[-1], color,
label.x, (j==1)*label.y,
maxdim[-1],
currlev+1, label[-1], shade)
} else {
segments(x1+(x2-x1)*c(0,2,4)/5, rep(y.b[j],3),
x1+(x2-x1)*c(1,3,5)/5, rep(y.b[j],3))
}
}
} else { # ncol(X) <= 1: final split polygon and segments.
for (j in 1:ydim) {
if (YP[j] > 0) {
polygon(c(x1,x2,x2,x1),
c(y.b[j],y.b[j],y.t[j],y.t[j]),
lty = if(shade) X[j, p+1] else 1,
col = if(shade) {
color[X[j, p+2]]
} else color[j])
if(F) {
segments(c(x1,x1,x1,x2),
c(y.b[j],y.b[j],y.t[j],y.t[j]),
c(x2,x1,x2,x2),
c(y.b[j],y.t[j],y.t[j],y.b[j]))
}
} else {
segments(x1+(x2-x1)*c(0,2,4)/5, rep(y.b[j],3),
x1+(x2-x1)*c(1,3,5)/5, rep(y.b[j],3))
}
}
}
}
}
##-- Begin main function
if(is.null(dim(X)))
X <- as.array(X)
else if(is.data.frame(X))
X <- data.matrix(X)
dimd <- length(dX <- dim(X))
if(dimd == 0 || any(dX == 0))
stop("`X' must not have 0 dimensionality")
##-- Set up `Ind' matrix : to contain indices and data
Ind <- 1:dX[1]
if(dimd > 1) {
Ind <- rep(Ind, prod(dX[2:dimd]))
for (i in 2:dimd) {
Ind <- cbind(Ind,
c(matrix(1:dX[i], byrow=TRUE,
nr = prod(dX[1:(i-1)]),
nc = prod(dX[i:dimd]))))
}
}
Ind <- cbind(Ind, c(X))
## Ok, now the columns of `Ind' are the cell indices (which could
## also have been created by `expand.grid()' and the corresponding
## cell counts. We add two more columns for dealing with *EXTENDED*
## mosaic plots which are produced unless `shade' is FALSE, which
## currently is the default. These columns have NAs for the simple
## case. Otherwise, they specify the line type (1 for positive and
## 2 for negative residuals) and color (by giving the index in the
## color vector which ranges from the ``most negative'' to the
## ``most positive'' residuals.
if(is.logical(shade) && !shade) {
Ind <- cbind(Ind, NA, NA)
}
else {
if(is.logical(shade))
shades <- c(2, 4)
else {
if(any(shade <= 0) || length(shade) > 5)
stop("invalid shade specification")
shades <- sort(shade)
shade <- T
}
breaks <- c(-Inf, - rev(shades), 0, shades, Inf)
if(T) {
color <- c(hsv(0, # red
s = seq(1, to = 0, length = length(shades) + 1)),
hsv(4/6, # blue
s = seq(0, to = 1, length = length(shades) + 1)))
} else {
# for S
color <- c(6, 6, 0, # red
0, 5, 5) # blue
}
if(is.null(margin))
margin <- as.list(1:dimd)
## Fit the loglinear model.
E <- loglin(X, margin, fit = TRUE, print = FALSE)$fit
## Compute the residuals.
type <- match.arg(type)
residuals <-
switch(type,
pearson = (X - E) / sqrt(E),
deviance = {
tmp <- 2 * (X * log(ifelse(X==0, 1, X/E)) - (X-E))
tmp <- sqrt(pmax(tmp, 0))
ifelse(X > E, tmp, -tmp)
},
FT = sqrt(X) + sqrt(X + 1) - sqrt(4 * E + 1))
## And add the information to the data matrix.
Ind <- cbind(Ind,
c(1 + (residuals < 0)),
as.numeric(cut(residuals, breaks)))
}
## The next four may all be NULL:
label <- dimnames(X)
nam.dn <- names(label)
if(is.null(space)) space <- rep(10, length=dimd)
if(is.null(dir)) dir <- rep(c("v","h"), length=dimd)
if(is.null(xlab)) xlab <- paste(nam.dn[dir=="v"],collapse=" / ")
if(is.null(ylab)) ylab <- paste(nam.dn[dir=="h"],collapse=" / ")
if (!is.null(sort)) {
if(length(sort) != dimd)
stop("length(sort) doesn't conform to dim(X)")
## Sort columns.
Ind[,1:dimd] <- Ind[,sort]
space <- space[sort]
dir <- dir[sort]
label <- label[sort]
}
ncolors <- length(tabulate(Ind[,dimd]))
if(!shade && ((is.null(color) || length(color) != ncolors))) {
color <- if (is.null(color) || !color[1])
rep(0, ncolors)
else
2:(ncolors+1)
}
##-- Plotting
# leave space for labels, but not tick marks
opar <- par(mar = c(1.1,1,0,0.05), mgp = c(0, 0, 0))
#opar <- par(mar = c(0.1,0,0,0.05), mgp = c(1,1,0))
on.exit(par(opar))
plot.new()
if(!shade) {
# make the limits (1,1000) exactly
par(usr=c(1,1000,1,1000))
}
else {
## This code is extremely ugly, and certainly can be improved.
## In the case of extended displays, we also need to provide a
## legend for the shading and outline patterns. The code works
## o.k. with integer breaks in `shade'; rounding to two 2 digits
## will not be good enough if `shade' has length 5.
pin <- par("pin")
rtxt <- "Standardized\nResiduals:"
## Compute cex so that the rotated legend text does not take up
## more than 1/12 of the of the plot region horizontally and not
## more than 1/4 vertically.
rtxtCex <- min(1,
pin[1] / (strheight(rtxt, units = "i") * 12),
pin[2] / (strwidth (rtxt, units = "i") / 4))
rtxtWidth <- 0.1 # unconditionally ...
## We put the legend to the right of the third axis.
# this is same as par(usr=)
par(usr=c(1, 1000 * (1.1 + rtxtWidth), 1, 1000))
rtxtHeight <-
strwidth(rtxt, units = "i", cex = rtxtCex) / pin[2]
text(1000 * (1.05 + 0.5 * rtxtWidth), 0, labels = rtxt,
adj = c(0, 0.25), srt = 90, cex = rtxtCex)
## `len' is the number of positive or negative intervals of
## residuals (so overall, there are `2 * len')
len <- length(shade) + 1
## `bh' is the height of each box in the legend (including the
## separating whitespace
bh <- 0.95 * (0.95 - rtxtHeight) / (2 * len)
x.l <- 1000 * 1.05
x.r <- 1000 * (1.05 + 0.7 * rtxtWidth)
y.t <- 1000 * rev(seq(from = 0.95, by = - bh, length = 2 * len))
y.b <- y.t - 1000 * 0.8 * bh
ltype <- c(rep(2, len), rep(1, len))
for(i in 1 : (2 * len)) {
polygon(c(x.l, x.r, x.r, x.l),
c(y.b[i], y.b[i], y.t[i], y.t[i]),
col = color[i],
lty = ltype[i])
}
brks <- round(breaks, 2)
y.m <- y.b + 1000 * 0.4 * bh
text(1000 * (1.05 + rtxtWidth), y.m,
c(paste("<", brks[2], sep = ""),
paste(brks[2 : (2 * len - 1)],
brks[3 : (2 * len)],
sep = ":"),
paste(">", brks[2 * len], sep = "")),
srt = 90, cex = cex)
}
if (!is.null(main) || !is.null(xlab) || !is.null(ylab))
title(main, xlab=xlab, ylab=ylab, ...)
# compute margins
top.margin = right.margin = 0
for(k in which(dir == "v")) {
top.margin = top.margin + max(strheight(label[[k]],cex=cex))
# right.margin must accommodate half of the last label
# (in the worst case)
w = strwidth(label[[k]],cex=cex)
right.margin = max(right.margin,w[length(w)]/2)
}
# 0.05 in between text and plot
top.margin = top.margin + 0.05/par("pin")[2]*diff(par("usr")[3:4])
left.margin = 0
for(k in which(dir == "h")) {
if(rotate) {
h = max(strheight(label[[k]],cex=cex,units="inch"))
h = h/par("pin")[1]*diff(par("usr")[1:2])
left.margin = left.margin + max(h)
} else {
w = strwidth(label[[k]],cex=cex)
left.margin = left.margin + max(w)
}
}
left.margin = left.margin + 0.05/par("pin")[1]*diff(par("usr")[1:2])
# x2 is not exactly 1000 to accommodate labels which run off the end
mosaic.cell(Ind,
x1=left.margin, y1=1,
x2=1000-right.margin, y2=1000-top.margin,
space/100, dir,
color, par("usr")[1], par("usr")[4],
maxdim= apply(as.matrix(Ind[,1:dimd]), 2, max),
currlev= 1, label, shade)
}
# extra routines for multivariate analysis
# Tom Minka 12/3/01
eigen2.top <- function(A,B,n=1,x,tol=1e-10) {
d <- nrow(A)
if(missing(x)) x <- array(rnorm(d),c(d,1))
U <- chol(B)
for(iter in 1:100) {
x0 <- x
Ax0 <- A %*% x0
x <- backsolve(U,Ax0,transpose=T)
x <- backsolve(U,x)
x <- x/sqrt(sum(x*x))
if(max(abs(x - x0)/max(abs(x))) < tol) break
}
e <- c()
for(i in 1:ncol(x)) {
xAx <- t(x[,i]) %*% A %*% x[,i]
xBx <- t(x[,i]) %*% B %*% x[,i]
e[i] <- xAx/xBx
}
list(values=e,vectors=x)
}
solve.chol <- function(a,b) {
ch <- chol(a)
backsolve(ch,backsolve(ch,b,transpose=T))
}
# returns x which solves a*x = b*x*eigenvalues
# k is desired number of eigenvectors (optional)
eigen2 <- function(a,b,orth=NULL,k) {
# solve(b,a) is same as inv(b)*a
#iba <- solve.chol(b,a)
iba <- solve(b,a)
if(is.null(orth)) e <- eigen(iba,symmetric=F)
else {
if(is.vector(orth)) orth <- array(orth,c(length(orth),1))
n <- nrow(orth)
orth.k <- ncol(orth)
orth.m <- eye(n) - (orth %*% t(orth))
x <- orth.m %*% iba %*% orth.m
e <- eigen(x,symmetric=F)
# remove zero eigenvalues from result
e$vectors <- e$vectors[,1:(ncol(x)-orth.k),drop=F]
e$values <- e$values[1:(ncol(x)-orth.k)]
}
if(!missing(k)) {
e$vectors <- e$vectors[,1:k,drop=F]
e$values <- e$values[1:k,drop=F]
}
# remove trivial imaginary parts
e$vectors <- Re(e$vectors)
e$values <- Re(e$values)
e
}
ml.cov <- function(x) {
n <- nrow(x)
v <- cov(x)
if(n == 1) v <- array(0,dim(v))
# avoid singularity
condition.matrix(v,n)
}
condition.matrix <- function(x,n=100) {
#a <- max(eigen(v,only=T)$values)
a = mean(diag(x))
(x*(n-1) + eye(ncol(x))*a*1e-5)/n
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.