#######################################################################
######################### scaling #################################
#######################################################################
# undoes R 'scale()'
# reverses scaling on scaledx, dividing its columns by sds and adding
# ctrs; if either of the latter 2 is NULL, it is obtained via attr(), on
# the assumption that scaledx was produced from x by scale() or similar
# returns the original x; if scaledx
unscale <- function(scaledx,ctrs=NULL,sds=NULL) {
if (is.null(ctrs)) ctrs <- attr(scaledx,'scaled:center')
if (is.null(sds)) sds <- attr(scaledx,'scaled:scale')
origx <- scaledx
for (j in 1:ncol(scaledx)) {
origx[,j] <- origx[,j] * sds[j]
origx[,j] <- origx[,j] + ctrs[j]
}
origx
}
# scale to [0,1]
# arguments:
# m: a vector or matrix
# scalePars: if not NULL, a 2-row matrix, with column i storing
# the min and max values to be used in scaling column i of m;
# typically, one has previously called mmscale() on a dataset and
# saved the resulting scale parameters, and we want to use those
# same scale parameters on new data
# p: if m is a vector, specify the number of columns it should
# have as a matrix; code will try to take care of this by itself
# if p is left at NULL
# value: a matrix, with column i consisting of the scaled version
# of column i of m, and attribute as in scalePars (either copied from
# the latter or if null, generated fresh
mmscale <- function (m,scalePars=NULL,p=NULL)
{
# much of this code involves cases in which we have a vector but it
# is to be treated as a matrix
if (is.vector(m) && is.null(scalePars) && is.null(p))
stop('specify argument p')
if (is.null(p)) {
if (!is.null(scalePars))
p <- ncol(scalePars)
else p <- ncol(m)
}
if (is.vector(m))
m <- matrix(m, ncol = p)
if (is.null(scalePars)) {
rngs <- apply(m, 2, range)
mins <- rngs[1, ]
maxs <- rngs[2, ]
}
else {
mins <- scalePars[1, ]
maxs <- scalePars[2, ]
rngs <- scalePars
}
ranges <- maxs - mins
tmm <- function(i) m[, ] <- (m[, i] - mins[i])/ranges[i]
m <- sapply(1:ncol(m), tmm)
if (is.vector(m))
m <- matrix(m, ncol = p)
attr(m, "minmax") <- rngs
m
}
#######################################################################
################### factors and dummy variables #####################
#######################################################################
# these routines are useful in that some regression packages insist that
# predictor be factors, while some require dummy variables
# for each column in dfr, if factor then replace by dummies,
# else just copy column; if omitLast, then dummy for last level of
# factor is not included in output
# a key point is that, for later prediction after fitting a model, one
# needs to use the same transformations; otherwise, the data to be
# predicted may be missing a level of some factor; this of course is
# especially true if one is predicting a single case
# thus the factor names and levels are saved in attributes, and can be
# used as input, via factorInfo and factorsInfo for factorToDummies()
# and factorsToDummies():
# factorToDummies() outputs and later inputs factorInfo
# factorsToDummies() outputs and later inputs factorsInfo
#################### factorsToDummies() ######################
# inputs a data frame, outputs same but with all factor cols expanded to
# dummies
# arguments
# dfr: a data frame
# omitLast: if TRUE, make m-1 dummies for an m-level factor
# factorsInfo: factor levels found earlier, R list, element nm
# is levels of factor named nm
# dfOut: if TRUE, output a data frame rather than a matrix
# if the input has cols not numeric or factor, ftn will quit
factorsToDummies <- function(dfr,omitLast=FALSE,factorsInfo=NULL,
dfOut=FALSE)
{
if (is.factor(dfr)) dfr <- as.data.frame(dfr)
# for now, no input cols other than numeric, factor allowed
## nnf <- function(i) (!is.numeric(dfr[,i]) && !is.factor(dfr[,i]))
## notnumfact <- sapply(1:ncol(dfr),nnf)
## if (any(notnumfact))
## stop('non-numeric, non-factor columns encountered')
outDF <- NULL
nullFI <- is.null(factorsInfo)
if (nullFI) factorsInfoOut <- list()
for (i in 1:ncol(dfr)) {
dfi <- dfr[,i]
if (length(levels(dfi)) == 1 && length(dfi) > 1) {
msg <- paste(names(dfr)[i],'constant column: ',i)
warning(msg)
}
colName <- names(dfr)[i]
if (!is.factor(dfi)) {
if (!is.numeric(dfi))
dfi <- as.factor(dfi)
outDF <- cbind(outDF,dfi)
colnames(outDF)[ncol(outDF)] <- colName
} else {
dumms <- factorToDummies(dfi,colName,omitLast=omitLast,
factorInfo=factorsInfo[[colName]])
if (nullFI) {
factorInfo <- attr(dumms,'factorInfo')
factorsInfoOut[[colName]] <- factorInfo
}
outDF <- cbind(outDF,dumms)
}
}
res <- if (!dfOut) as.matrix(outDF) else outDF
if (nullFI) {
attr(res,'factorsInfo') <- factorsInfoOut
} else
attr(res,'factorsInfo') <- factorsInfo
res
}
#################### factorToDummies() ######################
# converts just a single factor
# def of omitLast is in comments above
# factorInfo is used if we are converting a factor that we've already
# converted on previous data; this argument is used to ensure that the
# conversion on the new data is consistent with the old, important for
# prediction settings
# easier to have both f, fname required
factorToDummies <- function (f,fname,omitLast=FALSE,factorInfo=NULL)
{
n <- length(f)
fl <- levels(f)
if (!is.null(factorInfo)) {
fn <- factorInfo$fname
if (fn != fname) stop('mismatched fname')
ol <- factorInfo$omitLast
if (ol != omitLast) stop('mismatChed omitLast')
fullLevels <- factorInfo$fullLvls
if (length(setdiff(fl,fullLevels)))
stop(paste('new factor level found'))
} else fullLevels <- fl
useLevels <-
if(omitLast) fullLevels[-length(fullLevels)] else fullLevels
ndumms <- length(useLevels)
dms <- matrix(nrow = n, ncol = ndumms)
for (i in 1:ndumms) dms[, i] <- as.integer(f == useLevels[i])
colnames(dms) <- paste(fname,'.', useLevels, sep = "")
tmp <- list()
tmp$fname <- fname
tmp$omitLast <- omitLast
tmp$fullLvls <- fullLevels # all levels even last
attr(dms,'factorInfo') <- tmp
dms
}
# in predicting after fitting a regression model, a factor that was fit
# may now encounter new levels, preventing prediction; this function
# reports which factors/levels, if any, are new; it should be called
# before calling predict()
# arguments:
# info1: one of
# the "old" data frame, used in fit (case A);
# an R list of levels for each factor in that frame
# (case B); or
# output of a qe*() call on that frame that
# has a 'factorLevelsPresent' component
# (future enhancement) (case C)
# data2: "new" data frame, used in prediction
# value:
# vector of row numbers in which a new factor level was found
checkNewLevels <- function(info1,data2)
{
tmp <- sapply(data2,is.factor)
factorNames <- names(data2)[tmp]
if (is.data.frame(info1)) { # case A
tmp <- sapply(info1,is.factor)
tmp <- names(info1)[tmp]
levelsPresent1 <- lapply(info1[tmp],function(t) unique(t))
} else if (setequal(names(info1),names(data2))) { # case B
levelsPresent1 <- info1 # case C
} else { # case C
levelsPresent1 <- info1$factorLevelsPresent
}
res <- NULL
for (nm in factorNames) {
tmp <- setdiff(levels(data2[[nm]]),levelsPresent1[[nm]])
if (length(tmp) > 0) {
matches <- which(data2[[nm]] %in% tmp)
res <- union(res,matches)
}
}
res
}
#################### dummiesToFactor() ######################
# makes a factor from a single related set of dummies dms; if the
# variable has k levels, inclLast = FALSE means there are only k-1
# dummies provided, so the k-th must be generated
dummiesToFactor <- function(dms,inclLast=FALSE)
{
dms <- as.matrix(dms)
if (!inclLast) {
lastCol <- 1 - apply(dms,1,sum)
dms <- cbind(dms,lastCol)
}
where1s <- apply(dms,1,function(rw) which(rw == 1))
colnames(dms) <- paste0('V',1:ncol(dms),sep='')
nms <- colnames(dms)
f <- nms[where1s]
as.factor(f)
}
#################### dummiesToInt() ######################
dummiesToInt <- function(dms,inclLast=FALSE) {
as.numeric(dummiesToFactor(dms=dms,inclLast=inclLast))
}
# maps a factor to 0,1,2,...,m-1 where m is the number of levels of f;
# saves the levels in an attribute, e.g. for use in a later predict()
# setting; if using earlierLevels, then f will be a character vector
# with values in earlierLevels
factorTo012etc <- function(f,earlierLevels=NULL) {
if (!is.null(earlierLevels)) {
checkOneValue <- function(val) which(val == earlierLevels) - 1
return(sapply(f,checkOneValue))
}
tmp <- as.numeric(f)-1
attr(tmp,'earlierLevels') <- levels(f)
tmp
}
#################### intToDummies() ######################
# inputs an integer vector x and creates dummies for the various values
intToDummies <- function(x,fname,omitLast=TRUE)
{
tmp <- as.factor(x)
factorToDummies(tmp,fname,omitLast=omitLast)
}
#################### charsToFactors() ######################
# inputs a data frame and converts all character columns to factors
charsToFactors <- function(dtaf)
{
for (i in 1:ncol(dtaf)) {
cli <- dtaf[,i]
if (is.character(cli)) {
dtaf[,i] <- as.factor(cli)
}
}
dtaf
}
#################### xyDataframeToMatrix() ######################
# inputs a data frame intended for regression/classification, with X in
# the first cols and Y in the last; converts all factors to dummies, and
# outputs a matrix; in creating dummies, r-1 are retained for r levels,
# except for Y
# see also toAllNumeric() below
xyDataframeToMatrix <- function(xy) {
p <- ncol(xy)
x <- xy[,1:(p-1)]
y <- xy[,p]
xd <- factorsToDummies(x,omitLast=TRUE)
yd <- factorToDummies(y,'y',omitLast=FALSE)
as.matrix(cbind(xd,yd))
}
#################### hasFactors() ######################
# x is a data frame; returns TRUE if at least one column is a factor
hasFactors <- function(x)
{
for (i in 1:ncol(x)) {
if (is.factor(x[,i])) return(TRUE)
}
FALSE
}
#################### hasCharacters() ######################
# dfr is a data frame; returns TRUE if at least one column is in character mode
hasCharacters <- function(dfr)
{
for (i in 1:ncol(dfr)) {
if (is.character(dfr[,i])) return(TRUE)
}
FALSE
}
#################### toSuperFactor() ######################
# say we have a factor f1, then encounter f2, with levels a subset of
# those of f1; we want to change f2 to have the same levels as f1; seems
# that this can NOT be done via levels(f2) <- levels(f1); typically used
# in predict() functions
toSuperFactor <- function(inFactor,superLevels)
{
inFactorChars <- as.character(inFactor)
extraChars <- setdiff(superLevels,inFactorChars)
nExtra <- length(extraChars)
if (nExtra == 0) return(inFactor)
newInFactorChars <- c(inFactorChars,extraChars)
nExtra <- length(extraChars)
tmp <- as.factor(newInFactorChars)
nTmp <- length(tmp)
start <- nTmp - nExtra + 1
end <- nTmp
tmp[-(start:end)]
}
#################### toSubFactor() ######################
# here we have a factor f with various levels, but want to lump all
# levelsl but the ones in saveLevels to a new level, lumpedLevel; the
# default for the latter is 'zzzOther', chosen to ensure that the lumped
# level is last
toSubFactor <- function(f,saveLevels,lumpedLevel='zzzOther')
{
lvls <- levels(f)
fChar <- as.character(f)
other <- setdiff(lvls,saveLevels)
whichOther <- which(fChar %in% other)
fChar[whichOther] <- lumpedLevel
as.factor(fChar)
}
#################### toAllNumeric() ######################
# change character variables to factors, then all factors to dummies,
# recording factorInfo for later use in prediction; put result in wm
# w: data frame
# factorsInfo: value found in a previous call
toAllNumeric <- function(w,factorsInfo=NULL)
{
if (hasCharacters(w)) {
stop('character variables currently not supported')
## w <- charsToFactors(w)
}
if (hasFactors(w)) {
wm <- factorsToDummies(w,omitLast=TRUE,factorsInfo=factorsInfo)
attr(wm,'factorsInfo')
} else {
wm <- w
}
wm
}
#######################################################################
################### misc. data frame/matrix ops ######################
#######################################################################
# multiply x[,cols] by vals, e.g. x[,cols[1]] * vals[1]
# code by Bochao Xin
multCols <- function(x,cols,vals) {
# tx <- t(x[,cols])
# x[,cols] <- t(tx*vals)
# for (i in 1:length(cols)) {
# cl <- cols[i]
# x[,cl] <- x[,cl] * vals[1,i]
# }
tx <- t(x[,cols])
x[,cols] <- t(tx*vals)
x
}
# check for constant cols
# d is a matrix or data frame; returns empty vector (i.e. length == 0)
# if no cols are constant, otherwise indices of those that are constant
constCols <- function(d) {
if (is.matrix(d)) d <- as.data.frame(d)
nDistinct <- sapply(lapply(d, unique), length)
return(which(nDistinct == 1))
}
# print a data frame row
catDFRow <- function(dfRow) {
for (i in 1:ncol(dfRow)) {
cat(as.character(dfRow[1,i]),' ')
}
}
# print the classes of a data frame
getDFclasses <- function(dframe) {
tmp <- sapply(1:ncol(dframe),function(i) class(dframe[,i]))
names(tmp) <- names(dframe)
tmp
}
# check whether all elements of a list, including a data frame, are
# numeric
allNumeric <- function(lst)
{
tmp <- sapply(lst,is.numeric)
all(tmp)
}
#######################################################################
###################### misc. lm() routines #######################
#######################################################################
# computes the standard error of the predicted Y for X = xnew
stdErrPred <- function(regObj,xnew) {
xx <- c(1,xnew) # the 1 accounts for the intercept term
xx <- as.numeric(xx) # in case xnew was a row in a data frame
as.numeric(sqrt(xx %*% vcov(regObj) %*% xx))
}
#######################################################################
###################### misc. graphics ################################
#######################################################################
# "3-D" graphs of (x,y,z), where (x,y) points are plotted in 2-D for
# various values of z; pts connected by lines, with z values displayed
# at the connection points; grouping is allowed, specified via a 4th
# column if desired
# arguments:
# xyz: matrix/df consisting of x, y and z above, possible a 4th; in
# latter case, xyz must be a data frame
# clrs: colors for the various lines; default uses heat.colors()
# xlim, ylim: as in R plot(); default uses largest ranges
# xlab,ylab: as in R plot()
# legendPos: first argument to legend(), e.g. 'topright'
xyzPlot <- function(xyz,clrs=NULL,cexText=1.0,
xlim=NULL,ylim=NULL,xlab=NULL,ylab=NULL,legendPos=NULL,plotType='l')
{
if (is.null(xlim)) xlim <- range(xyz[,1])
if (is.null(ylim)) ylim <- range(xyz[,2])
if (is.null(xlab)) xlab <- 'x'
if (is.null(ylab)) ylab <- 'y'
oneLine <- (ncol(xyz) == 3)
if (is.null(clrs)) {
if (oneLine) clrs <- 'black'
else clrs <- heat.colors(length(unique(xyz[,4])))
}
if (plotType == 'l')
# so that lines "move to the right," rather than a jumble
xyz <- xyz[order(xyz[,1]),]
nr <- nrow(xyz)
lineGrps <-
if (oneLine) list(1:nr)
else split(1:nr,xyz[,4])
nGrps <- length(lineGrps)
# plot(xyz[lineGrps[[1]],1:2],type=plotType,col=clrs[1],
plot(1,
xlim=xlim,ylim=ylim,xlab=xlab,ylab=ylab,cex=0.1)
### if (nGrps > 1)
for (i in 1:nGrps) {
if (plotType == 'l') {
lines(xyz[lineGrps[[i]],1:2],type='l',col=clrs[i])
}
else
points(xyz[lineGrps[[i]],1:2],col=clrs[i],cex=0.1)
}
for (i in 1:nGrps) {
lns <- xyz[lineGrps[[i]],]
text(lns[,1],lns[,2],lns[,3],cex=cexText,col=clrs[i])
}
# add legend
if (!oneLine && !is.null(legendPos)) {
legend(legendPos,legend=unique(xyz[,4]),col=clrs,lty=1)
}
}
# print current image to file
prToFile <- function (filename)
{
origdev <- dev.cur()
parts <- strsplit(filename,".",fixed=TRUE)
nparts <- length(parts[[1]])
suff <- parts[[1]][nparts]
if (suff == "pdf") {
pdf(filename)
}
else if (suff == "png") {
png(filename,bg='white')
}
else jpeg(filename)
devnum <- dev.cur()
dev.set(origdev)
dev.copy(which = devnum)
dev.set(devnum)
dev.off()
dev.set(origdev)
}
#######################################################################
###################### PCA routines ##################################
#######################################################################
#################### PCAwithFactors() ###############################
# allows use of prcomp() with data that may include R factors
# arguments:
# x: a data frame
# nComps: number of PC components to use
# value: object of class 'PCAwithFactors', with components as follows
# pcout: output of calling prcomp() on toAllNumeric(x)
# factorsInfo: attr from call to toAllNumeric(), if any; needed for
# predict.PCAwithFactors()
# preds: PCA version of x
# note: scaling will be applied, using mmscale()
PCAwithFactors <- function(x,nComps=ncol(x))
{
if (!is.data.frame(x)) stop('x must be a data frame')
namesOrigX <- names(x)
nColsOrigX <- ncol(x)
factorIdxs <- which(sapply(x,is.factor))
if (identical(factorIdxs,1:ncol(x)))
stop('case of all-factor data not supported yet')
xscale <- mmscale(x[,-factorIdxs])
minmax <- attr(xscale,'minmax')
x[,-factorIdxs] <- xscale
if (length(factorIdxs) > 0) {
x <- factorsToDummies(x)
factorsInfo <- attr(x,'factorsInfo')
}
pcout <- prcomp(x)
xpca <- predict(pcout,x)[,1:nComps]
res <- list(pcout=pcout,xpca=xpca,factorsInfo=factorsInfo,
namesOrigX=namesOrigX,nColsOrigX=nColsOrigX,factorIdxs=factorIdxs,
minmax=minmax,nComps=nComps)
class(res) <- 'PCAwithFactors'
res
}
# find PCA rep of newx; newx in original scale, output in scale of
# mmscale() + PCA
predict.PCAwithFactors <- function(object,newx)
{
if (!identical(names(newx),object$namesOrigX))
stop('column names mismatch')
factorIdxs <- object$factorIdxs
tmp <- as.matrix(newx[,-factorIdxs])
p <- object$nColsOrigX - length(factorIdxs)
newxscale <- mmscale(tmp,object$minmax,p=p)
if (nrow(newx) == 1) newxscale <- newxscale[1,]
newx[,-factorIdxs] <- newxscale
newx <- factorsToDummies(newx,factorsInfo=object$factorsInfo)
# unfortunately, cannot use predict.prcomp(), as it scales
preds <- as.matrix(newx) %*% object$pcout$rotation
preds[,1:object$nComps]
}
######################### doPCA() ##########################
# call prcomp(x,pcaProp), transform x to PCs, with the number of the
# latter being set according to the top pcaProp proportion of variance
doPCA <- function(x,pcaProp)
{
pcaout <- prcomp(x,scale.=TRUE)
xpca <- predict(pcaout,x)
xNames <- colnames(xpca)
pcVars <- pcaout$sdev^2
ncx <- ncol(xpca)
csums <- cumsum(pcVars)
csums <- csums/csums[ncx]
numPCs <- min(which(csums >= pcaProp))
xpca <- xpca[,1:numPCs]
newData <- as.data.frame(xpca)
names(newData) <- xNames[1:numPCs]
list(pcaout=pcaout,numPCs=numPCs,newData=newData)
}
#######################################################################
###################### misc. other #####################################
#######################################################################
###################### misc. list ops ################################
# assign list components to individual variables of the same names
# similar to unpack() in zeallot pkg
ulist <- function(lst)
{
nms <- names(lst)
if (any(nms == '')) stop('missing list name')
tmp <- substitute(for (nm in nms) assign(nm,lst[[nm]]))
eval(tmp,parent.frame())
}
####################### loss functions ###############################
# mean absolute prediction error
MAPE <- function(yhat,y)
{
if (!is.vector(y) || !is.vector(yhat))
stop('inputs must be vectors')
mean(abs(yhat-y))
}
# overall error rate
# either
# y is a vector of 0s and 1s,
# yhat a vector of estimated probabilities of 1
# or
# y is a vector of numeric class labels, starting at 1 or 0
# yhat is a matrix, with y[i,j] = prob of Y = j+startAt1-1
probIncorrectClass <- function(yhat,y,startAt1=TRUE
)
{
if (is.factor(y) || is.factor(yhat))
stop('vector/matrix inputs only')
if (is.vector(yhat)) {
yhat <- round(yhat)
return(mean(yhat != y))
}
classPred <- apply(yhat,1,which.max)
classActual <- y - startAt1 + 1
mean(classPred != classActual)
}
# proportion misclassified; deprecated in favor of probIncorrectClass
propMisclass <- function(y,yhat)
{
if (!is.vector(y) && !is.factor(y))
stop('predicted classes must be a vector or factor')
mean(y != yhat)
}
# included lossFtn choices are MAPE and probIncorrectClass; user may
# supply others
findOverallLoss <- function (regests, y, lossFtn = MAPE)
{
loss1row <- function(regestsRow) lossFtn(y, regestsRow)
apply(regests, 1, loss1row)
}
######################### other misc. ################################
# convenience wrapper for replicate; finds the means and standard errors
# and the nrep replication; toReplic can be a vector
# arguments:
# nrep: number of replications
# toReplic: expression to replicate; use braces and semicolons if
# more than one statement; must return either a number or a vector
# of numbers
# timing: if TRUE, apply system.time() to each replication, and
# calculate the mean elapsed time
# value:
# mean outcomes of the simulation, plus an attribute storing the
# associated standard errors
replicMeans <- function(nrep,toReplic,timing=FALSE) {
if (timing) {
tmp <- paste0('{tm <- system.time(z <- ',toReplic,'); ')
tmp <- paste0(tmp,'c(tm[3],z)}')
toReplic <- tmp
}
cmd <- paste0('replicate(',nrep,',',toReplic,')')
cmdout <- eval.parent(parse(text=cmd))
# cmdout <- eval(parse(text=cmd))
# if toReplic returns a vector, cmdout will be a matrix; to handle
# this, make it a matrix anyway
if (!is.matrix(cmdout)) cmdout <- matrix(cmdout,ncol=nrep)
meancmdout <- rowMeans(cmdout)
attr(meancmdout,'stderr') <- apply(cmdout,1,sd) / sqrt(nrep)
meancmdout
}
# convenience wrapper for cut()
# arguments:
# x: numeric vector
# endpts: endpoints for the desired intervals, treated as open on the
# left and closed on the right; to avoid NA values, make sure all
# of x is accommodated
# value:
# discrete version of x, with values 1,2,3,...; will have an R
# attribute, 'endpts', so as to remember which ones we used
discretize <- function(x,endpts)
{
xc <- cut(x,endpts,labels=1:(length(endpts)-1))
attr(xc,'endpts') <- endpts
xc
}
require(gtools)
# the problem with strsplit('a b') is that it yields (in [[1]]
# component) 'a','','b'; the version below doesn't give any empty
# strings
pythonBlankSplit <- function(s)
{
tmp <- strsplit(s,' ')[[1]]
tmp[tmp != '']
}
# use this after doing error checking, giving the user the choice of
# leaving, or continuing in the debugger
stopBrowser <- defmacro(msg,expr=
{
cat(msg,'\n')
d <- readline('hit Enter to leave, or d to enter debugger: ')
if (d == '') stop('')
browser()
}
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.