# functions to reduce dimensionality
#' Project data into fewer dimensions
#'
#' Reduces the dimensionality of a data set.
#'
#' @param x a data frame
#' @param w a matrix with named rows and columns
#' @details
#' Each column of \code{w} specifies a new variable, which is to be
#' constructed by combining existing variables according to the given weights.
#' @return
#' A data frame where the variables named in the rows of \code{w} are
#' replaced by new variables named in the columns of \code{w}.
#' Other variables are left unchanged.
#' @author Tom Minka
#' @seealso
#' \code{\link{pca}}, \code{\link{projection}}, \code{\link{plot_axes}}
#' @examples
#' data(iris)
#' w = projection(iris, k=2)
#' # w only involves the continuous attributes
#' # the new variables are h1 and h2
#' x = project(iris, w)
#' # in RStudio use dev.new()
#' color.plot(x)
#' plot_axes(w)
#' @export
project <- function(x,w) {
pred <- rownames(w)
xw <- data.frame(data.matrix(x[pred]) %*% w)
other <- setdiff(colnames(x),pred)
if(length(other) > 0) cbind(xw, x[other])
else xw
}
slice <- function(x,w,b,k=1/4) {
  # returns a subset of x containing the fraction k of the points
  # whose projection w'x is nearest b
px = project(x,w)[,1]
k = ceiling(k*length(px))
i = order(abs(px - b))[1:k]
x[i,]
}
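# usage sketch for slice(), reusing the iris example from project() above:
# keep the quarter of the rows whose first projected coordinate is nearest 0
#   data(iris)
#   w <- projection(iris, k=2)
#   nrow(slice(iris, w, b=0))   # ceiling(0.25 * 150) = 38 rows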
standardize.projection <- function(w) {
# break symmetries by making early dimensions positive
s = (nrow(w):1) %*% w
for(j in 1:ncol(w)) {
if(s[j] < 0) w[,j] <- -w[,j]
}
#print(t(w)%*%w)
w
}
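# example: the sign convention makes the result invariant to column flips
#   w <- matrix(c(0.6, -0.8), 2, 1)
#   identical(standardize.projection(w), standardize.projection(-w))  # TRUE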
#' Principal Component Analysis
#'
#' Computes a projection matrix that preserves spread.
#'
#' @param x a data frame.
#' @param k the number of dimensions to project down to.
#' @details
#' The projection is chosen so that the projected data is as "spread out"
#' as possible, measured according to the determinant of the covariance
#' matrix.
#' This turns out to be the top \code{k} eigenvectors of the data
#' covariance matrix.
#'
#' The projection is "stabilized" so that small changes in the data do
#' not cause sign flips in the projection.
#' @return
#' A matrix with named rows matching the numeric columns of \code{x} and
#' columns named \code{h1}, ..., \code{hk}.
#' Each column denotes a new dimension to be obtained as a linear combination
#' of the numeric variables in \code{x}.
#'
#' @author Tom Minka
#' @seealso \code{\link{project}}, \code{\link{projection}},
#' \code{pca} in the \code{multiv} package
#' @examples
#' data(Housing)
#' # standardize first, since pca() does not center the data
#' HousingT <- scale(Housing)
#' w <- pca(HousingT, k=2)
#' dev.new()
#' plot(project(HousingT, w), asp=1)
#' plot_axes(w)
#' @export
pca <- function(x,k=1,...) {
x <- as.data.frame(x)
x <- data.matrix(x[sapply(x,is.numeric)])
  # note: no centering is done here, so pass centered/standardized data
  # if covariance PCA is intended
  s <- svd(x)
cat("R-squared =",format(sum(s$d[1:k]^2)/sum(s$d^2)),"\n")
w <- s$v[,1:k]
if(k == 1) dim(w) <- c(length(w),1)
rownames(w) <- colnames(x)
colnames(w) <- paste("h",1:ncol(w),sep="")
standardize.projection(w)
}
pca2 <- function(x,k=1) {
  # PCA via Roweis's EM algorithm
# takes O(dnk) time
x <- as.data.frame(x)
x <- data.matrix(x[sapply(x,is.numeric)])
x <- t(x)
d <- nrow(x)
n <- ncol(x)
w <- array(rnorm(d*k),c(d,k))
# EM
for(iter in 1:100) {
old.w <- w
if(d >= n) {
# h is k by n
# flops is kdk + kdn + kkn + dnk + knk + kkn = O(dnk)
h <- solve.chol(t(w) %*% w, t(w) %*% x)
w <- x %*% t(solve.chol(h %*% t(h), h))
} else {
# flops is kdk + kdn + kkd + knd + knk + kkd = O(dnk)
h <- solve.chol(t(w) %*% w, t(w)) %*% x
w <- t(solve.chol(h %*% t(h), h %*% t(x)))
}
if(max(abs(w - old.w)) < 1e-5) break
}
if(iter == 100) warning("not enough iterations")
# postprocessing
w <- qr.Q(qr(w))
wx <- t(w) %*% x
s <- svd(wx)
cat("R-squared =",format(sum(s$d^2)/sum(x*x)),"\n")
w <- w %*% s$u
rownames(w) <- rownames(x)
colnames(w) <- paste("h",1:ncol(w),sep="")
standardize.projection(w)
}
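# sanity-check sketch: pca() and pca2() should find the same subspace,
# and standardize.projection() makes the signs agree
#   x <- data.frame(matrix(rnorm(1000), 200, 5))
#   w1 <- pca(x, k=2)
#   w2 <- pca2(x, k=2)
#   max(abs(w1 - w2))   # small, up to the EM convergence tolerance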
#' Discriminative projection
#'
#' Finds a projection matrix that separates data groups.
#'
#' @param x a data frame of variables to project.
#' @param y a factor or numeric vector which defines the groups to separate.
#' If \code{y} is not given, it is
#' taken to be the response variable of \code{x} (the last column if
#' \code{x} is not a \code{model.frame}).
#' @param k the number of dimensions to project \code{x} down to.
#' @param given A matrix specifying axes to avoid. The projection matrix
#' will be orthogonal to \code{given}.
#' @param type one of \code{"m"}, \code{"v"}, \code{"mv"}, or \code{"nn"};
#' see Details.
#' @param ... additional parameters depending on \code{type}.
#' @details
#' This function only uses the second-order statistics of the data (means
#' and covariances of the groups).
#'
#' If \code{type="m"}, the within-group covariances are assumed equal
#' and the projection will try to separate the projected means.
#'
#' If \code{type="v"}, the within-group means are assumed equal
#' and the projection will try to separate the projected covariances.
#'
#' If \code{type="mv"}, the projection will try to separate the projected
#' means and covariances, by maximizing the divergence between the
#' projected classes (as Gaussians).
#'
#' If \code{y} is a numeric vector, overlapping classes are defined by
#' grouping data points with similar values of \code{y}.
#' The optional argument \code{span} controls how big the classes will
#' be (as a percentage of the dataset), and \code{res} controls the
#' amount of overlap. The total number of classes will be
#' \code{res}/\code{span}. The default values are usually acceptable.
#'
#' The projection is "stabilized" so that small changes in the data do
#' not cause sign flips in the projection.
#' @return
#' A matrix suitable for input to \code{\link{project}},
#' with named rows matching columns of \code{x} and
#' columns named \code{h1}, ..., \code{hk}.
#' Each column denotes a new dimension to be obtained as a linear combination
#' of the variables in \code{x}.
#' @author Tom Minka
#' @references
#' m-projection is Fisher's linear discriminant analysis.
#' mv-projection is heteroscedastic discriminant analysis:
#'
#' N. Kumar and A.G. Andreou.
#' Heteroscedastic discriminant analysis and
#' reduced rank HMMs for improved speech recognition.
#' \emph{Speech Communication} 26: 283-297, 1998.
#'
#' The case when \code{y} is numeric is sliced inverse
#' regression:
#'
#' K.-C. Li. Sliced inverse regression for dimension reduction.
#' \emph{Journal of the American Statistical Association} 86(414):
#' 316-327, 1991.
#' @seealso \code{\link{project}},\code{\link{pca}}
#' @examples
#' # illustrate difference between (m,v,mv)
#' library(MASS)
#' m1 <- c(6,6)
#' v1 <- array(c(2,1.9,1.9,2),c(2,2))
#' #v1 <- array(c(1,0,0,1),c(2,2))
#' x1 <- mvrnorm(100,m1,v1)
#' m2 <- c(0,0)
#' v2 <- array(c(20,0,0,10),c(2,2))
#' x2 <- mvrnorm(300,m2,v2)
#' x = as.data.frame(rbind(x1,x2))
#' y = factor(c(rep(1,nrow(x1)),rep(2,nrow(x2))))
#' plot(x[,1],x[,2],col=1,xlab="",ylab="",asp=1)
#' points(x2[,1],x2[,2],col=2)
#' w = projection(x,y,type="m")
#' abline(0,w[2]/w[1],col=3)
#' w = projection(x,y,type="v")
#' abline(0,w[2]/w[1],col=4)
#' w = projection(x,y,type="mv")
#' abline(0,w[2]/w[1],col=5)
#' my.legend(1,c("m","v","mv"),col=3:5,lty=1)
#'
#' # regression projection
#' x1 <- 2*runif(200)-1
#' x2 <- 2*runif(200)-1
#' y <- x1^2/2 + x2^2
#' x <- data.frame(x1,x2)
#' # in RStudio use dev.new()
#' color.plot(x[,1],x[,2],y)
#' w = projection(x,y)
#' abline(0,w[2]/w[1],col=4)
#' @export
projection <- function(x,y=NULL,k=1,type=c("mv","m","v","nn"),...) {
if(is.null(y)) {
resp <- response.var(x)
y <- x[[resp]]
pred <- predictor.terms(x)
x <- x[pred]
}
type <- match.arg(type)
if(type == "nn") return(projection.nn(x,y,k,...))
if(ncol(x) > 500) stop("data has too many dimensions")
if(all(is.na(y))) {
# unsupervised case
#if(type == "m") return(pca(x,k=k))
s = projection.stats.unsup(x,type,...)
va <- s[[1]]
vs <- s[[2]]
ns <- s[[3]]
} else if(!is.factor(y)) {
# regression projection
s <- projection.stats.reg(x,y,...)
va <- s[[1]]
vs <- s[[2]]
ns <- s[[3]]
} else {
xs <- split(x,y)
vs <- lapply(xs,ml.cov)
ns <- lapply(xs,nrow)
va <- ml.cov(x)
}
w = projection.from.stats(va,vs,ns,k,type,...)
rownames(w) <- names(x)
colnames(w) <- paste("h",1:ncol(w),sep="")
standardize.projection(w)
}
projection.from.stats <- function(va,vs,ns,k,type=c("mv","m","v"),...) {
fun <- switch(type, m=projection.m.stats,
v=projection.cov.stats,
mv=projection.mv.stats)
fun(va,vs,ns,k,...)
}
slicing <- function(x,y=NULL,...) {
# best slice is worst projection
if(is.null(y)) {
resp <- response.var(x)
y <- x[[resp]]
pred <- predictor.terms(x)
x <- x[pred]
}
k = ncol(x)
w = projection(x,y,k=k,...)
w[,k,drop=F]
}
optimize.slice <- function(x,y,...) {
# returns the setting of b which gives best separation
# x must already be projected
if(missing(y)) {
resp <- response.var(x)
y <- x[[resp]]
pred <- predictor.terms(x)
x <- x[pred]
}
optimize.slice.reg(x,y,...)
}
# uses linear regression to get projection
projection.lm <- function(x,y,k=1,given=NULL) {
vxy <- cov(x,y)
vxx <- cov(x)
e <- eigen2(vxy %*% t(vxy),vxx,given)
w <- e$vectors[,1:k]
if(k == 1) dim(w) <- c(length(w),1)
if(is.null(given)) w
else cbind(given,w)
}
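# usage sketch (eigen2() is assumed to be this package's generalized
# eigensolver, as used throughout this file):
#   x <- matrix(rnorm(200), 100, 2); y <- x[,1] + rnorm(100)/10
#   w <- projection.lm(x, y)   # close to (1,0), up to sign and scale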
projection.stats.unsup <- function(x,type,span,nmax=40,...) {
d = ncol(x)
if(missing(span)) span = 2*d
subset = 1:nrow(x)
if(nrow(x) > nmax) subset = sample(1:nrow(x),nmax)
va = ml.cov(x[subset,])
vs = list()
ns = list()
if(type == "m") {
for(i in 1:length(subset)) {
vs[[i]] = diag(d)
ns[[i]] = 1
}
} else {
for(i in 1:length(subset)) {
# learn a covariance around x[i,]
v = diag(d)
for(iter in 1:100) {
# generally takes 10 iters
dis = sqdist(x[subset[i],],x,solve(v))
j = order(dis)[1:(span+1)]
if(iter > 1 && all(j == old.j)) break
old.j = j
v = ml.cov(x[j,])
}
vs[[i]] = v
ns[[i]] = length(j)
}
}
list(va,vs,ns)
}
midpoints <- function(x) (x[-length(x)]+x[-1])/2
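# e.g. midpoints(c(0, 1, 4)) is c(0.5, 2.5)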
# returns stats for a projection which separates x values leading to
# different y's
# treats it as a classification problem with n classes
# it is invariant to transformations of y
projection.stats.reg <- function(x,y,span=NULL,means=F,res=2,...) {
ny = length(unique(y))
# estimate covariance matrix about each point
if(is.null(span)) {
span <- min(c(5*ncol(x),ny/2))
#cat("projection.reg: using nearest ",span," responses\n")
} else {
span <- span*ny
}
vs <- list()
ms = list()
# ys is the "classes" of y to be separated
# res determines how many classes will be used
ys = midpoints(break.quantile(y,ny/span*res))
#print(length(ys))
#ys = y
ns = list()
for(i in seq(ys)) {
d <- abs(y - ys[i])
ord <- order(d)
my.span = span
while(my.span < length(d)) {
# include all points whose distance is within that of last neighbor
j <- which(d <= d[ord[my.span]] + eps)
#cat(i,"has",length(j),"neighbors\n")
vs[[i]] <- ml.cov(x[j,,drop=F])
if(sum(diag(vs[[i]])) > 0) break
my.span = my.span + 1
}
ns[[i]] = my.span
if(means) ms[[i]] = colMeans(x[j,,drop=F])
}
va <- ml.cov(x)
if(means) list(va,vs,ns,ms)
else list(va,vs,ns)
}
optimize.slice.reg <- function(x,y,...) {
s = projection.stats.reg(x,y,means=T,...)
va <- s[[1]]
vs <- s[[2]]
ns <- s[[3]]
ms = s[[4]]
optimize.slice.stats(va,vs,ns,ms)
}
optimize.slice.stats <- function(va,vs,ns,ms) {
  # coerce the 1x1 covariance matrices to scalars so the arithmetic below
  # is elementwise rather than a non-conformable matrix operation
  va = as.numeric(va)
  ms = as.numeric(ms)
  vs = as.numeric(vs)
  a = as.numeric(ns)*log(vs/va)/vs
b = mean(ms)
# this iteration isn't quite right, but seems to work
repeat {
old.b = b
g = a*dnorm(b,ms,sqrt(vs))
b = sum(g*ms)/sum(g)
if(abs(b - old.b) < 1e-6) break
}
b
}
# returns a projection which separates the class means
projection.m.stats <- function(va,vs,ns,k=1,given=NULL,...) {
n <- accumulate(ns)
vw <- accumulate(zip(vs,ns))/n
e <- eigen2(va,vw,given)
i <- rev(order(e$values))[1:k]
w <- e$vectors[,i]
if(k == 1) dim(w) <- c(length(w),1)
if(is.null(given)) w
else cbind(given,w)
}
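# eigen2(A,B) above is assumed to solve the generalized eigenproblem
# A v = lambda B v; ignoring the `given` constraint, a minimal base-R
# equivalent via Cholesky whitening would be:
#   gen.eigen <- function(A, B) {
#     U <- chol(B)                       # B = U'U
#     Ui <- backsolve(U, diag(nrow(B)))  # U^{-1}
#     e <- eigen(t(Ui) %*% A %*% Ui, symmetric=TRUE)
#     list(values=e$values, vectors=Ui %*% e$vectors)
#   }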
# returns a projection which separates the class covariances
projection.cov.stats <- function(va,vs,ns,k=1,given=NULL,...) {
if(length(vs) > 2) {
n <- accumulate(ns)
va <- accumulate(zip(vs,ns))/n
return(projection.mv.stats(va,vs,ns,k,given))
}
v1 <- vs[[1]]
v2 <- vs[[2]]
n <- accumulate(ns)
a <- ns[[1]]/n
if(T) {
tim <- system.time(e <- eigen2(v1,v2,given))[3]
#cat("time to compute",nrow(v1),"eigenvectors:",tim,"\n")
} else {
tim1 <- system.time(e1 <- eigen2.top(v1,v2,k))[3]
tim2 <- system.time(e2 <- eigen2.top(v2,v1,k))[3]
#cat("time to compute",2*k,"eigenvectors:",tim1+tim2,"\n")
e <- list(values=c(e1$values,1/e2$values),vectors=cbind(e1$vectors,e2$vectors))
}
lambda <- abs(e$values)
J <- a*log(lambda) - log(a*lambda + 1-a)
# keep eigenvectors with smallest J
i <- order(J)[1:k]
w <- e$vectors[,i]
if(k == 1) dim(w) <- c(length(w),1)
if(is.null(given)) w
else cbind(given,w)
}
score.mv <- function(w,va,vs,ns) {
n <- accumulate(ns)
J <- logdet(t(w) %*% va %*% w)
for(i in 1:length(vs)) {
J <- J - ns[[i]]/n*logdet(t(w) %*% vs[[i]] %*% w)
}
J
}
score.m <- function(w,va,vs,ns) {
n <- accumulate(ns)
J <- logdet(t(w) %*% va %*% w)
avg.v <- array(0,dim(va))
for(i in 1:length(vs)) {
avg.v <- avg.v + (ns[[i]]/n)*vs[[i]]
}
J - logdet(t(w) %*% avg.v %*% w)
}
vec <- function(x) {
if(is.list(x)) array(unlist(x),c(prod(dim(x[[1]])),length(x)))
else array(as.vector(x),c(length(x),1))
}
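# e.g. vec(diag(2)) is a 4x1 column;
# vec(list(diag(2), 2*diag(2))) stacks each matrix into a 4x2 array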
# returns a projection which separates the class means and covariances
projection.mv.stats <- function(va,vs,ns,k=1,given=NULL,joint=F,...) {
n <- accumulate(ns)
if(!joint && k > 1) {
# initialize with greedy solution
# find projections one at a time
w <- projection.mv.stats(va,vs,ns,k=1,given,...)
for(i in 1:(k-1)) {
w <- projection.mv.stats(va,vs,ns,k=1,given=w,...)
}
return(w)
} else {
# initialize with best of m,v
w1 <- projection.m.stats(va,vs,ns,k=k,given)
w1 <- w1[,ncol(w1)+1-(k:1),drop=F]
J1 <- score.mv(w1,va,vs,ns)
if(length(vs) == 2) {
w2 <- projection.cov.stats(va,vs,ns,k=k,given)
w2 <- w2[,ncol(w2)+1-(k:1),drop=F]
J2 <- score.mv(w2,va,vs,ns)
} else J2 <- -Inf
if(J2 > J1) w <- w2
else w <- w1
}
ns <- unlist(ns)
d <- nrow(va)
if(k == 1) vec.vs <- vec(vs)
# EM optimization
for(iter in 1:100) {
oldw <- w
vb <- array(0,dim(va))
if(k > 1) {
# slow
for(i in 1:length(vs)) {
wv <- det(t(w) %*% vs[[i]] %*% w)^(1/k)
vb <- vb + (ns[[i]]/n/wv)*vs[[i]]
}
} else {
# fast for k == 1
wv <- t(w %x% w) %*% vec.vs
vb <- vec.vs %*% vec(ns/n/wv)
dim(vb) <- c(d,d)
}
if(F) {
# check that objective always increases
sw <- if(is.null(given)) w else cbind(given,w)
print(score.mv(sw,va,vs,ns))
}
w <- eigen2(va,vb,given,k=k)$vector
# force vectors to be orthogonal
if(k > 1) w <- qr.Q(qr(w))
if(max(abs(w - oldw)) < 1e-5) break
}
#cat("projection.mv: ",iter," EM iterations\n")
if(is.null(given)) w
else cbind(given,w)
}
projection.glm <- function(fit,k=2,...) {
x <- model.frame(fit)
fw <- coef(fit)[-1]
fw <- fw/norm(fw)
projection(x,given=fw,k=k-1,...)
}
color.plot.project.glm <- function(fit,data,type="mv",col=3,coef=F,lwd=2,...) {
x <- model.frame(fit)
fw <- coef(fit)[-1]
a <- coef(fit)[1]/norm(fw)
fw <- fw/norm(fw)
w <- projection(x,k=1,given=fw,type=type)
a = a*w[1,1]/fw[1]
px = project(if(missing(data)) x else data, w)
color.plot(px,...)
abline(v=-a,col=col,lwd=lwd)
invisible(w)
}
#' Plot axes under projection
#'
#' Shows how unit vectors along the original dimensions appear
#' under a projection.
#' @param w a numeric array with two columns and named rows
#' @param col color of arrows
#' @param origin If T, arrows emerge from (0,0). If F, arrows emerge
#' from the center of the figure. If not given, arrows emerge from
#' (0,0) if (0,0) is visible, otherwise from the figure center.
#' @param keep a length in inches, below which an arrow is not plotted.
#' @param top if given, only the \code{top} longest arrows are plotted.
#' @param cex character expansion for the arrow labels.
#' @param labels ignored; included so that a \code{labels} argument is not
#' passed on to \code{\link{text}}.
#' @param ... further arguments passed to \code{\link{text}}.
#' @details
#' Each row of the array specifies a location in the figure. This
#' location can be understood as the place where a unit vector
#' along one of the original dimensions would appear. An arrow is
#' drawn from the origin pointing toward this location and labeled with
#' the row name. The relative length of the arrow is determined by the
#' distance of the location from the origin, but all arrows are scaled as
#' much as possible, to reduce crowding, while remaining inside the
#' figure limits. Arrows which are very short are not plotted at all.
#' @return Uses \code{\link{arrows}} to draw arrows on top of the current plot.
#' @author Tom Minka
#' @seealso
#' \code{\link{project}}, \code{\link{pca}}, \code{\link{projection}}
#' @export
plot_axes <- function(w,col=2,origin=NULL,keep=NULL,top=NULL,
cex=par("cex"),labels,...) {
# labels is to prevent it going to text()
if(is.null(keep)) keep <- 0.2
usr <- par("usr")
# is the origin in the plot?
if(is.null(origin)) {
origin <- (usr[1] < 0) && (usr[2] > 0) && (usr[3] < 0) && (usr[4] > 0)
}
if(origin) m <- c(0,0)
else m <- c((usr[2]+usr[1])/2, (usr[4]+usr[3])/2)
width <- strwidth(rownames(w),cex=cex)/2
height <- strheight(rownames(w),cex=cex)/2
# "a" is scale factor to place text at arrow tips
a <- pmin(width/abs(w[,1]), height/abs(w[,2]))
# txt.w is the offset of the text from the arrow tip
txt.w <- w * rep.col(a,2)
xlim <- usr[1:2]-m[1]
# make room on left
xlim[1] <- xlim[1] + diff(xlim)/par("pin")[1]*0.02
xscale <- c()
# find biggest xscale so that
# xscale*w[,1] + txt.w[,1] - width > xlim[1] (w < 0)
# xscale*w[,1] + txt.w[,1] + width < xlim[2] (w > 0)
# ignoring constraints which cannot be satisfied
i <- which(w[,1] < 0)
if(length(i) > 0) {
xscale[1] <- min((xlim[1] + width[i] - txt.w[i,1])/w[i,1])
} else xscale[1] = Inf
i <- which(w[,1] > 0)
if(length(i) > 0) {
xscale[2] <- min((xlim[2] - width[i] - txt.w[i,1])/w[i,1])
} else xscale[2] = Inf
  # if one of the sets is empty, its scale is Inf and drops out of the min
xscale <- min(xscale,na.rm=T)
ylim <- usr[3:4]-m[2]
# make room for color key
ylim[2] <- ylim[2] - diff(ylim)/par("pin")[2]*0.06
yscale <- c()
# find yscale so that
# yscale*w[,2] + txt.w[,2] - height > ylim[1] (w < 0)
i <- (w[,2] < 0)
yscale[1] <- min((ylim[1] + height[i] - txt.w[i,2])/w[i,2])
i <- (w[,2] > 0)
yscale[2] <- min((ylim[2] - height[i] - txt.w[i,2])/w[i,2])
yscale <- min(yscale,na.rm=T)
# scale equally
xscale <- min(c(xscale,yscale))
  if(xscale < 0) {
    # the labels cannot fit with arrows from (0,0); retry from the center
    e = match.call()
    e$origin = F
    return(eval(e))
  }
yscale <- xscale
w <- scale(w,center=F,scale=1/c(xscale,yscale))
if(!is.null(top)) {
# only plot longest arrows
wn = apply(w,1,norm)
i = rep(F,nrow(w))
i[rev(order(wn))[seq(top)]] = T
} else {
# only plot arrows of significant length
wi = w
wi[,1] = inchx(w[,1])
wi[,2] = inchy(w[,2])
wn = apply(wi,1,norm)
i <- (wn > keep)
}
if(any(!i)) cat("omitting arrow for",rownames(w)[!i],"\n")
if(!any(i)) { warning("all arrows omitted"); return() }
len <- min(par("pin"))/20
arrows(m[1], m[2], m[1]+w[i,1], m[2]+w[i,2], col=col, len=len)
w <- scale(w,center=-m,scale=F)
w <- w + txt.w
# note: rownames(w[i,]) does not work if i picks only one
text(w[i,1],w[i,2],labels=rownames(w)[i],col=col,cex=cex,...)
}
projection.nn <- function(x,y=NULL,k=1,...) {
if(is.null(y)) {
resp <- response.var(x)
y <- x[[resp]]
pred <- predictor.terms(x)
x <- x[pred]
}
# this is for y=NA
j = nearest.points(x)$which
# center only
x = scale(x,scale=F)
x = as.matrix(x)
xj = x[j,]
A = t(x) %*% xj
A = A + t(A)
xx = t(x) %*% x
A = A + xx - (t(xj) %*% xj)
# avoid singularity
A = A + eye(nrow(A))*mean(diag(A))*1e-5
  s = eigen(A,symmetric=T)
w = s$vectors[,1:k,drop=F]
n = nrow(x)
for(iter in 1:10) {
old.w = w
A = array(0,c(ncol(x),ncol(x)))
d = (xj %*% w) - (x %*% w)
e = rowMeans(d*d)
s.sum = numeric(n)
# e[i] is the distance between projected i and j
for(ik in 1:nrow(x)) {
d = rep.row(x[ik,] %*% w, nrow(x)) - (x %*% w)
# ek[i] is the distance between projected i and k
ek = rowMeans(d*d)
# want difference to be large
ek = ek - e
# s[i] is weight for term (i,k)
s = 1/(1 + exp(ek))
xki = rep.row(x[ik,],n) - x
xki = scale.rows(xki,s)
A = A + (t(xki) %*% xki)
s.sum = s.sum + s
}
#print(s.sum)
cat("pts with most weight: ",order(s.sum),"\n")
xji = xj - x
xji = scale.rows(xji,s.sum)
A = A - (t(xji) %*% xji)
    g = A %*% w
    # orthogonal-iteration update (assumed step: the original loop computed
    # the gradient but never changed w, so it always exited immediately)
    w = qr.Q(qr(g))
    if(max(abs(w - old.w)) < 1e-4) break
}
print(iter)
rownames(w) <- colnames(x)
colnames(w) <- paste("h",1:ncol(w),sep="")
standardize.projection(w)
}
identify.data.frame <- function(x,n=1,index=F,...) {
resp = response.var(x)
pred = predictor.vars(x)[1]
i = identify(x[,pred],x[,resp],labels=rownames(x),n=n,...)
if(index) i else rownames(x)[i]
}
identify.formula <- function(fmla,data=parent.frame(),...) {
x = model.frame(fmla,data)
identify.data.frame(x,...)
}
#' @exportS3Method
scale.data.frame <- function(x,center=T,scale=T,exclude.response=F,...) {
# scales the numeric columns, leaving rest alone
a = attr(x,"terms")
i <- sapply(x,is.numeric)
if(exclude.response) {
resp = response.var(x)
j = which(colnames(x) == resp)
i[j] = F
}
if(length(center) == 1) center = rep(center,ncol(x))
if(length(scale) == 1) scale = rep(scale,ncol(x))
for(j in which(i)) {
x[,j] = scale.default(x[,j],center=center[j],scale=scale[j],...)
}
#x[i] = apply.df(x[i],function(y) scale(y,...))
#attr(x,"terms") = a
x
}
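# e.g. scale(iris) standardizes the four numeric columns and returns
# Species unchanged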
#' @exportS3Method
hist.data.frame <- function(x,breaks,layout,col="tomato",...) {
if(missing(layout)) layout <- auto.layout(length(x))
opar <- par(mfrow=layout)
on.exit(par(opar))
for(i in names(x)) {
if(is.numeric(x[[i]])) {
      # choose breaks per column unless given explicitly
      b <- if(missing(breaks)) max(2,nclass.Sturges(x[[i]])) else breaks
      hist(x[[i]],xlab=i,main="",breaks=b,col=col,...)
} else {
barplot(table(x[[i]]),xlab=i,legend=F,...)
}
}
}
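# e.g. hist(iris) draws a histogram for each numeric column and a
# barplot for Species, laid out automatically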
factor.dichotomy <- function(f,lev,label=NULL) {
# lev can be a vector of levels
if(is.null(label)) label = paste(lev,collapse=",")
labels <- c(label,paste("NOT",label))
fd <- rep(labels[2],length(f))
if(T) {
fd[f %in% lev] <- labels[1]
} else {
for(one.lev in lev) {
fd[f == one.lev] <- labels[1]
}
}
fd <- factor(fd,levels=labels)
names(fd) <- names(f)
fd
}
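# e.g. factor.dichotomy(iris$Species, "setosa") is a factor with
# levels "setosa" and "NOT setosa"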
factor.isolate <- function(f,i) {
# add a new level containing only i
lev = levels(f)
nam <- names(f)
f <- as.character(f)
names(f) <- nam
f[i] <- i
lev = intersect(lev,unique(f))
factor(f,levels=c(lev,i))
}
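# e.g. with f <- factor(c(a="x", b="x", c="y")),
# factor.isolate(f, "b") has levels c("x","y","b"),
# with only element b in the new level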
separate.level <- function(x,lev,type="mv",layout,keep=NULL,identify.flag=F,axes=F,...) {
# one vs rest
# only makes sense if length(lev) > 2
resp <- response.var(x)
y <- x[[resp]]
if(missing(lev)) lev <- levels(y)
if(missing(layout)) layout <- auto.layout(length(lev))
# must do this or mfrow will clobber cex (bug in par?)
wpar <- par(cex=par("cex"))
opar <- par(mfrow=layout)
par(wpar)
on.exit(par(opar,wpar))
for(k in 1:length(lev)) {
x[[resp]] <- factor.dichotomy(y,lev[k])
w <- projection(x,k=2,type=type)
px <- project(x,w)
color.plot(px,axes=axes,...)
plot_axes(w,origin=F,keep=keep,col=3)
if(identify.flag) {
h <- identify(px,plot=F)
if(length(h) > 0) text(px[h,1],px[h,2],rownames(px)[h],cex=0.9)
}
}
}
separate.pairwise <- function(x,level,type="mv",layout,identify.flag=F,axes=F,...) {
# level vs each
resp <- response.var(x)
y <- x[[resp]]
lev <- setdiff(levels(y),level)
if(missing(layout)) layout <- auto.layout(length(lev))
opar <- par(mfrow=layout)
on.exit(par(opar))
for(k in 1:length(lev)) {
i <- (y == lev[k]) | (y == level)
xk <- x[i,]
xk[[resp]] <- factor(xk[[resp]],levels=c(level,lev[k]))
w <- projection(xk,k=2,type=type)
px <- project(xk,w)
color.plot(px,axes=axes,...)
plot_axes(w,origin=F,col=3)
if(identify.flag) {
h <- identify(px,plot=F)
if(length(h) > 0) text(px[h,1],px[h,2],rownames(px)[h],cex=0.9)
}
}
}
logdet <- function(x) {
2*sum(log(diag(chol(x))))
}
separation <- function(x,type="mv") {
x <- as.data.frame(x)
y <- x[[response.var(x)]]
xp <- x[predictor.vars(x)]
xs <- split(xp,y)
vs <- lapply(xs,ml.cov)
ns <- lapply(xs,nrow)
n <- nrow(x)
if(T) {
va <- ml.cov(xp)
J <- logdet(va)
for(i in 1:length(ns)) {
J <- J - ns[[i]]/n*logdet(vs[[i]])
}
} else {
# loop pairs
r <- nchoosek(length(xs),2)
J <- 0
for(i in 1:length(r)) {
ri <- r[[i]]
r1 <- ri[1]
r2 <- ri[2]
n1 <- ns[[r1]]
n2 <- ns[[r2]]
v1 <- vs[[r1]]
v2 <- vs[[r2]]
if(type == "m") {
v1 <- (n1*v1 + n2*v2)/(n1+n2)
v2 <- v1
}
v12 <- ml.cov(rbind(xs[[r1]],xs[[r2]]))
Ji <- n1*logdet(v1) + n2*logdet(v2) - (n1+n2)*logdet(v12)
J <- J - exp(Ji/2/(n1+n2))
}
}
J
}
nchoosek <- function(n,k) {
if(k == 1) return(as.list(1:n))
r <- list()
for(i in k:n) {
r <- c(r,lapply(nchoosek(i-1,k-1), function(x) c(i,x)))
}
r
}
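# e.g. nchoosek(3, 2) returns list(c(2,1), c(3,1), c(3,2)),
# all 3-choose-2 index pairs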
simple.projection <- function(d,i) {
if(length(d) == 1) {
d.names <- as.character(1:d)
} else {
d.names <- d
d <- length(d)
}
w <- array(0,c(length(d.names),length(i)),list(d.names,i))
for(j in 1:ncol(w)) w[i[j],j] <- 1
w
}
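# e.g. simple.projection(c("a","b","c"), "b") is the 3x1 indicator
# matrix that picks out variable "b" under project()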
score.simple.stats <- function(va,vs,ns,k=1,type="mv",...) {
if(type == "v") {
n <- accumulate(ns)
va <- accumulate(zip(vs,ns))/n
}
pred <- colnames(va)
f <- nchoosek(length(pred),k)
r <- empty_data_frame(c(paste("Var",1:k,sep=""),"Score"))
for(i in 1:length(f)) {
p <- pred[f[[i]]]
w <- simple.projection(pred,p)
ri <- c(as.list(p), list(score.mv(w,va,vs,ns)))
names(ri) <- names(r)
r <- rbind_extend(r,data.frame(ri))
}
sort.data.frame(r)
}
# returns a frame of scores for each feature combination
score.features <- function(x,y,type=c("mv","m","v","r","lm"),...) {
x <- as.data.frame(x)
if(missing(y)) {
resp <- response.var(x)
y <- x[[resp]]
pred <- predictor.vars(x)
x <- x[pred]
}
if(missing(type)) {
if(is.factor(y)) type <- "mv"
else type <- "r"
} else type <- match.arg(type)
if(type == "r") {
s <- projection.stats.reg(x,y,...)
va <- s[[1]]
vs <- s[[2]]
ns <- s[[3]]
} else {
xs <- split(x,y)
vs <- lapply(xs,ml.cov)
ns <- lapply(xs,nrow)
va <- ml.cov(x)
}
score.simple.stats(va,vs,ns,type=type,...)
}
top.features <- function(x,r,m=4,layout,type="mv",...) {
#r <- score.features(x,k)
k <- ncol(r)-1
pred <- predictor.vars(x)
if(missing(layout)) layout <- auto.layout(m)
opar <- par(mfrow=layout)
on.exit(par(opar))
r <- r[nrow(r)+1-(1:m),]
for(i in 1:m) {
names <- as.vector(as.matrix(r[i,1:k]))
w <- simple.projection(pred,names)
xw <- project(x,w)
if(k > 2) {
w <- projection(xw,k=2,type=type)
xw <- project(xw,w)
}
color.plot(xw,...)
if(k > 2) plot_axes(w)
}
}
outliers <- function(x,p=0.05,names=T) {
x <- x[sapply(x,is.numeric)]
  if(ncol(x) >= nrow(x)) {
    warning("more variables than cases")
    v <- diag(sapply(x,sd)^2)
  } else v <- cov(x)
  D2 <- mahalanobis(x,colMeans(x),v)
pv <- 1-pchisq(D2,df=ncol(x))
r <- data.frame(D2=D2,"p.value"=pv,row.names=rownames(x))
r <- sort.data.frame(r)
i <- (r$p.value < p)
if(T) {
#qqplot(qchisq(ppoints(length(D2)), df=ncol(x)), D2)
if(any(i)) print(r[i,])
}
nam <- rownames(r)[i]
if(names) nam else match(nam,rownames(x))
}
separate <- function(x,i,label=NULL,axes=F,...) {
  # i can be a vector of row numbers or row names
  if(is.numeric(i)) {
f <- factor.dichotomy(seq(nrow(x)),i,label=label)
} else {
f <- factor.dichotomy(rownames(x),i,label=label)
}
sx <- scale(as.data.frame(x))
w <- projection(sx,f,k=2,type="m")
mar = par("mar")
on.exit(par(mar=mar))
color.plot(project(sx,w),f,axes=axes,...)
plot_axes(w,col=3,origin=F,...)
# clean up after creating sx
rm(sx)
gc()
sort(w[,1])
}
residuals.pca <- function(x,w) {
pred <- rownames(w)
y <- x[pred]
m <- apply(y,2,mean)
y <- sweep(y,2,m)
y <- data.frame((data.matrix(y) %*% w) %*% t(w))
y <- sweep(y,2,-m)
x[pred] <- x[pred] - y
x
}
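# usage sketch: remove the top 2-D principal subspace from the iris
# measurements
#   data(iris)
#   w <- pca(iris[1:4], k=2)
#   r <- residuals.pca(iris[1:4], w)   # the part the projection misses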