# two-dimensional visualization of the X data in classification problems,
# similar in spirit to ordinary PCA and t-sne, color coded by Y values
# (class IDs in classification case, subinternvals of Y in regression
# case)
# t-sne, e.g. in the Rtsne package, applied in dimension k, attempts to
# find a k-dimensional manifold for which most of the data are "near";
# for visualization purposes, typically k = 2, which is assumed here
# the idea here is to expand the data with polynomial terms, using
# getPoly(), then apply PCA to the result
# typically these methods are applied only to a subsample of the data,
# due both to the lengthy computation time and the "black screen
# problem" (having a lot of points fills the screen, rendering the plot
# useless)
# arguments:
# xy: data frame
# labels: if TRUE, last column is Y for a classification problem;
# must be an R factor, unless nIntervals is non-NULL, in
# which case Y will be discretized to make labels
# yColumn: the column of Y
# deg: degree of polynomial expansion
# maxInteractDeg: maximum combined degree for an interaction term
# scale: if TRUE, first call scale() on the X data
# nSubSam: number of rows to randomly select; 0 means get all
# nIntervals: in regression case, number of intervals to use for
# partioning Y range to create labels
# outliersRemoved: specify how many outliers to remove from
# the plot, calculated using mahalanobis distance. if
# outliersRemoved is between 0 and 1, a corresponding
# percentage of the data will be removed
# pcaMethod: specify how eigenvectors will be calculated, using
# prcomp or RSpectra
# bigData: use bigmemory package to store matrices if specified as true
# saveOutputs: specify the name of the file where the results will be saved.
# default file is 'lastPrVisOut'. set to the empty string to
# not save results.
# cex: argument to R plot(), controlling point size
# alpha: a number between 0 and 1 that can be used to specify transparency
# for alpha blending. If alpha is specified ggplot2 will be used to
# create the plot
prVis <- function(xy,labels=FALSE,yColumn = ncol (xy), deg=2,maxInteractDeg=deg,
scale=FALSE,nSubSam=0,nIntervals=NULL,
outliersRemoved=0,pcaMethod="prcomp",bigData=FALSE,
saveOutputs="lastPrVisOut",cex=0.5, alpha=0)
{
# safety check
if (!pcaMethod %in% c('prcomp','RSpectra'))
stop("pcaMethod should be either NULL, prcomp, or RSpectra")
if (yColumn > ncol(xy) || yColumn <= 0)
stop("The column specified is out of range")
if(outliersRemoved > 100)
stop("This is percentage-based (enter number between 0 to 100)")
#=======================data preprocessing stage========================
nrxy <- nrow(xy)
ncxy <- ncol(xy)
# yColumn option
if (labels && yColumn != ncxy) {
# swapping values
xy[,c(yColumn, ncxy)] <- xy[,c(ncxy, yColumn)]
# swapping column names
colnames(xy)[c(yColumn, ncxy)] <- colnames(xy)[c(ncxy, yColumn)]
}
# scale option
rns <- row.names(xy)
if (scale) {
if (labels) {
xy[,-ncxy] <- scale(xy[,-ncxy])
} else xy <- scale(xy)
row.names(xy) <- rns
}
# nSubSam option
if (nSubSam < nrxy && nSubSam > 0)
xy <- xy[sample(1:nrxy,nSubSam),]
# nIntervals option
if (labels) {
ydata <- xy[,ncxy]
yName <- colnames(xy)[ncxy]
if (is.null(nIntervals) && !is.factor(ydata))
stop('Y must be a factor for classif.; set nIntervals for regress.')
if (!is.null(nIntervals)) {
rng <- range(ydata)
increm <- (rng[2] - rng[1]) / nIntervals
ydata <- round((ydata - rng[1]) / increm)
ydata <- as.factor(ydata)
}
xdata <- xy[,-ncxy, drop=FALSE]
} else xdata <- xy
xName <- colnames(xdata)
remove(xy)
# after the preprocessing stage, ydata is a factor vector,
# xdata may be a dataframe with or without factor column(s)
# if xy is a dataframe
#=====================polynomial expansion + PCA========================
if (bigData) {
polyMat <- polyGet_big(xdata, deg, maxInteractDeg)
xdata <- predict(big_randomSVD(polyMat, k=2, ncores = nb_cores()))
colnames(xdata) <- c("PC1","PC2")
} else{
# $xdata should always be a matrix
polyMat <- getPoly(xdata, deg, maxInteractDeg)$xdata
if (pcaMethod == "prcomp") {
x.pca <- prcomp(polyMat,center=TRUE)
xdata <- x.pca$x[,1:2]
} else {
require(RSpectra)
x.cov <- cov(polyMat)
x.eig <- eigs(x.cov,2)
x.pca <- x.eig
xdata <- as.matrix(polyMat) %*% x.eig$vectors[,1:2]
colnames(xdata) <- c("PC1","PC2")
}
}
#===========================removing outliers===========================
if (outliersRemoved > 0 && outliersRemoved < 100){
nrxy <- nrow(xdata)
row.names(xdata) <- 1:nrxy
removeOutliers <- function(data, num) {
distances <- mahalanobis(data,colMeans(data),var(data))
names(distances) <- rownames(data)
sortedDistances <- sort(distances, decreasing=TRUE)
outliers <- names(sortedDistances)[1:num]
}
# remove outliers by class
if (labels){
names(ydata) <- 1:nrxy
partition <- tapply(rownames(xdata), ydata, c)
for (i in 1:length(partition)) {
xdatasub <- xdata[rownames(xdata) %in% partition[[i]],]
outliersToRemove <- floor(outliersRemoved * nrow(xdatasub) / 100)
if (outliersToRemove > 0) {
outliers <- removeOutliers(xdatasub, outliersToRemove)
xdata <- xdata[!rownames(xdata) %in% outliers,]
ydata <- ydata[!names(ydata) %in% outliers]
}
}
} else {
# percentage based outlier removal
outliersRemoved = floor(outliersRemoved * nrxy / 100)
if (outliersRemoved > 0) {
outliers <- removeOutliers(xdata, outliersRemoved)
# remove outliers
xdata <- xdata[!rownames(xdata) %in% outliers,]
}
}
}
#=====================producing plot + saving output====================
if (alpha) {
require(ggplot2)
if (labels) {
plotObject <- qplot(x=xdata[,1],y=xdata[,2],xlab="PC1",ylab="PC2",
alpha=alpha,col=ydata,size=I(cex))
} else {
plotObject <- qplot(x=xdata[,1],y=xdata[,2],xlab="PC1",ylab="PC2",
alpha=alpha,size=I(cex))
}
print(plotObject)
} else {
if (labels) {
plot(xdata, col=ydata, pch=15, cex=cex)
} else {
plot(xdata, pch=15, cex=cex)
}
}
if (saveOutputs != "" && !bigData){ # temporaily disable saving for bigData
if (labels)
outputList <- list(gpOut=polyMat,prout=x.pca,
colName=xName, yCol = ydata, yname=yName)
else
outputList <- list(gpOut=polyMat,prout=x.pca, colName=xName)
save(outputList,file=saveOutputs)
}
}
# intended to be used when a plot produced by prVis() is on the screen;
# chooses np points at random from the PCA output, writing their row
# numbers on the plot; these are the numbers from the full dataset, even
# if nSubSam > 0; the argument savedPrVisOut is the return value of
# prVis()
#
# arguments:
# np: the number of points to add row numbers to. if no value of np is
# provided, rownumbers will be added to all datapoints
# savedPrVisOut: the name of the file where a previous call to prVis was
# stored.
# area: vector with in the form of [x_start, x_finish, y_start, y_finish].
# x_start, x_finish, y_start, and y_finish should all be between 0
# and 1. These values correspond to percentages of the graph from
# left to right and bottom to top. [0,1,0,1] specifies the entirety
# of the graph. [0,0.5,0.5,1] specifies upper-left quadrant. x_start
# must be less than x_finish and y_start must be less than y_finish
addRowNums <-
function(np=0,area=c(0,1,0,1),savedPrVisOut="lastPrVisOut",cex=0.9)
{
load(savedPrVisOut)
if(is.null(row.names(outputList$prout$x[,1:2])))
row.names(outputList$prout$x) <-
as.character(1:nrow(outputList$prout$x))
pcax <- outputList$prout$x[,1:2]
if(!identical(area, c(0,1,0,1))){
# get boundaries of graph
xMin <- min(outputList$prout$x[,1])
xMax <- max(outputList$prout$x[,1])
yMin <- min(outputList$prout$x[,2])
yMax <- max(outputList$prout$x[,2])
# error checking on inputs
xI <- area[1]
xF <- area[2]
yI <- area[3]
yF <- area[4]
if (xI < 0 | xI > 1 | xF < 0 | xF > 1 | xI > xF)
if (yI < 0 | yI > 1 | yF < 0 | yF > 1 | yI > yF){
stop('invalid area boundaries, 0 < x_start < x_finish < 1, 0 < y_start <
y_finish < 1. area is in the form of c(x_start,x_finish,y_start,
y_finish)')
}
# scale x interval
xI <- (xMax - xMin)*xI + xMin
xF <- (xMax - xMin)*xF + xMin
# scale y interval
yI <- (yMax - yMin)*yI + yMin
yF <- (yMax - yMin)*yF + yMin
# filter to datapoints within specified range
pcax <- pcax[which(pcax[,1] <= xF & pcax[,1] >= xI & pcax[,2] <=
yF & pcax[,2] > yI),]
}
npcax <- nrow(pcax)
tmp <- sample(1:npcax,np,replace=FALSE)
rowNames <- row.names(pcax[tmp,])
print('highlighted rows:')
sorted <- sort(as.numeric(rowNames))
for (i in 1:length(rowNames)) {
rn <- rowNames[i]
print(sorted[i])
coords <- pcax[rn,]
text(coords[1],coords[2],rn, cex, col = "red")
}
}
# colorCode: display color coding for user-specified vairables or expressions.
# Normally called after prVis, produce a new coloring of the same plot produced
# by prVis.
# arguments:
# colName: user can specify the column that he or she wants to produce
# the color on. The column specified must be a continuous one.
# colorVec: a column of continuous data, with the first entry in the
# vector corresponding to the first row in the dataframe
# used in savedPrVisOut. Can be used when wanting to color
# by a column that was not included in the prVis call.
# n: The number of shades used to color code the values of colName.
# n and exps should not both be specified at the same time.
# exps: expressions that create a label column that produces coloring.
# If user specifies colName, he or she cannot provide arguments
# for exps, since they are two ways to produce coloring (should
# be mutually exclusive). User can supply several expressions,
# each one corresponding to a group (a label, a color, a level),
# and concatenating by c(). The expression should be mutually
# exclusive, since a data point cannot be in two colors.
# expression format:
# <exp> ::= <subexpression> [(+|*) <subexpression>]
# <subexpression> ::= columnname relationalOperator value
# Note: * represents logic and, + represents logic or
# savedPrVisOut: the file that stores a prVis object
colorCode <- function(colName="",colorVec=NULL, n=256,exps="",
savedPrVisOut="lastPrVisOut", cex = 0.5)
{
load(savedPrVisOut)
xdata <- outputList$gpOut[,1:length(outputList$colName)]
plotData <- outputList$prout$x[,1:2]
isColorDeclared <- xor(!is.null(colorVec),colName != "")
# cases covered by continColor:
if (isColorDeclared && exps == "")
{
if (!is.null(colorVec)) # colorVec spcified
d <- colorVec
else # colName spcified
{
if (!colName %in% outputList$colName) # colName is not in the dataframe
stop("The column specified is not a continuous one or not found")
else # normal
{
colNum = which(colName == outputList$colName)
d <- xdata[,colNum]
}
}
minVal <- min(d)
maxVal <- max(d)
diffVal <- maxVal - minVal
colorPalette <- rev(rainbow(n,start=0,end=0.7))
colorIndexes <- sapply(d, function(x) ((x - minVal) * n) %/% diffVal)
plot(plotData, col=colorPalette[colorIndexes])
}
else if (is.null(colorVec) && colName == "" && exps != "")
{
xdata <- as.data.frame(xdata)
# create a label column with potentially more than one labels
numberOfRows <- length(outputList$prout$x[,1])
userCol <- rep(NA, numberOfRows) # initialize label column
hasY <- !(is.null(outputList$yname))
if (hasY) #original dataset has a factor column
factorCol <- outputList$yCol
hasLabel <- c() # track rows that has already had a label, check for relable
for (i in 1:length(exps)) {
# delete all white spaces (compress the string)
exp <- gsub(" ", "", exps[i], fixed = TRUE)
labelName <- exp #use the expression as the label name
subExp <- unlist(strsplit(exp, "\\+|\\*"))
for (m in 1:length(subExp))
exp <- sub(subExp[m], "", exp, fixed = T)
exp <- unlist(strsplit(exp, split="")) #string to vector of characters
#number of +/* operators should be one less than the number of constraints
if (length(exp) != length(subExp) - 1)
stop (length(exp)," +/* not match ",length(subExp)," constraints")
for (j in 1:length(subExp)) { # solve all of an expressions constraints
# Ex has one constraint but the relational operator is extracted
# EX : "Male", "1"
Ex <- unlist(strsplit(subExp[j],"(==|>=|<=|>|<|!=)")) #relational ops
if (length(Ex) != 2) # Ex should have two components: column name, value
stop ("The constraint must follow the format: 'yourCol'
'relationalOperator' 'value'")
else {
tmp <- trimws(Ex[1])
columnNum <- grep(tmp, outputList$colName)
if (!length(columnNum) && (!hasY || tmp != outputList$yname))
stop("The specified column ",Ex[1],
" is not found in the data frame xy")
# restore the relational operator
relationalOp <- sub(Ex[1], "", subExp[j], fixed= TRUE)
relationalOp <- sub(Ex[2], "", relationalOp, fixed= TRUE)
if (hasY && tmp == outputList$yname) # Ex[1] is the factorcol
{
if (!Ex[2] %in% levels(factorCol))
stop ("The label ", Ex[2], " is not in the factor column")
# when ecounter operations between labels, only == and != make sense
if (!relationalOp %in% c("==", "!="))
stop ("Use of the inappropriate operator ", relationalOp)
# get the row numbers of data that satisfy the constraint userExp[i]
rowBelong <- switch(relationalOp, "==" = which(factorCol == Ex[2]),
"!=" = which (factorCol != Ex[2]))
}
else { # EX[1] is a continuous column, so Ex[2] should be a number
val <- as.double(Ex[2])
if (is.null(val)|| val< min(xdata[[columnNum]])
||val > max(xdata[[columnNum]]))
stop("The value ", Ex[2], " is out of the range")
# get the row numbers of data that satisfy the constraint userExp[i]
rowBelong <- switch(relationalOp,
"==" = which(xdata[[columnNum]] == val),
"!=" = which (xy[[columnNum]] != val),
">="= which(xdata[[columnNum]]>=val),
"<="=which(xdata[[columnNum]] <= val),
">" = which (xdata[[columnNum]] > val),
"<" = which(xdata[[columnNum]] < val))
}
}
if (j == 1) # initialize labelData
labelData <- rowBelong
else {
if (exp[j-1] == "*") # And, get the intersection of the row numbers
labelData <- intersect(labelData, rowBelong)
else # Or, get the union
labelData <- union(labelData, rowBelong)
}
} # end for loop
# check for overlaps! will cause relabel of certain data that satisfy more
# than two expressions. Enforcing mutual exclusivity between expressions
if (length(intersect(labelData, hasLabel)) != 0)
stop ("The expression ", i, " tries to relabel some data,
the groups must be mutually exclusive")
# gives the label to data that satisfy the expression
userCol[labelData] <- labelName
# update hasLabel to keep track of all data has been labeled
hasLabel <- union(labelData, hasLabel)
} # end big for loop
if (length(hasLabel) == 0) # no matching data of the expressions
stop ("Expression(s) match no data points")
userCol[-hasLabel] <- "others"
userCol <- as.factor(userCol)
plot (plotData, col=userCol, pch=15, cex=cex)
} # end createCol
else
{
if (colName == "" && exps == "" && is.null(colorVec)) # spcified nothing
stop("colName, expressions(exps), or colorVec must be specified")
else
stop ("colorVec, colName and exps should not be specified
at the same time")
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.