Nothing
maxValue <- .Machine$double.xmax # constant
# shift vector circularly for d elements,
# shift left with negative d, shift right with positive d
shift <- function(x, d=1) {
right <- TRUE # positive d means shift right
if (d < 0) {
right <- FALSE
d <- -d
}
d <- d %% length(x)
if (d == 0)
return(x)
if (right) {
start <-x[1:(length(x)-d)]
end <- x[(length(x)-d+1):length(x)]
}
else {
start <-x[1:d]
end <- x[(d+1):length(x)]
}
c(end,start)
}
resample <- function(x, ...) x[sample.int(length(x), ...)]
sqr <- function(x) {
x*x
}
# collect instances with the same position in different components of lst
dataframeFromList<-function(lst){
df <- data.frame(x = rep(NA, length(lst)))
for (j in 1:length(names(lst[[1]]))) {
col <- c()
for (i in 1:length(lst)){
col[i] <-lst[[i]][[j]]
}
df[[j]] <- col
}
#names(df) <- names(lst[[1]])
df
}
# generator for monmk 1,2,3 data sets
monkGen<-function(noInst, problem=1, pYes=0.5, classNoise=0) {
missingYes<-as.integer(noInst*pYes)
missingNo<-noInst-missingYes
m<-data.frame()
while (missingYes>0 || missingNo > 0) {
D1 <- as.integer(runif(noInst,0,3))
D2 <- as.integer(runif(noInst,0,3))
D3 <- as.integer(runif(noInst,0,2))
D4 <- as.integer(runif(noInst,0,3))
D5 <- as.integer(runif(noInst,0,4))
D6 <- as.integer(runif(noInst,0,2))
f<-numeric(noInst)
if (problem==1) {
f[D1==D2 | D5==0] <- 1
}
else if (problem==2) {
f[D1==0] <- f[D1==0]+1
f[D2==0] <- f[D2==0]+1
f[D3==0] <- f[D3==0]+1
f[D4==0] <- f[D4==0]+1
f[D5==0] <- f[D5==0]+1
f[D6==0] <- f[D6==0]+1
f[f!=2] <- 0
f[f==2] <- 1
}
else if (problem==3) {
f[(D5==2 & D4==0) | (D5!=3 & D2!=2)] <- 1
}
else stop("Invalid number of monk problem, should be 1, 2, or 3")
#class noise
if (classNoise > 0) {
ns<-runif(noInst,0,1)
f[ns<classNoise] <- abs(f[ns<classNoise]-1) # revert
}
t <- data.frame(
head_shape=factor(D1),
body_shape=factor(D2),
is_smiling=factor(D3),
holding=factor(D4),
jacket_color=factor(D5),
has_tie=factor(D6),
class=factor(f)
)
cl1=t[t$class==1,]
cl0=t[t$class==0,]
if (nrow(cl1)>=missingYes) {
m<-rbind(m,cl1[seq(1,length.out=missingYes),])
missingYes<-0
}
else {
m<-rbind(m,cl1)
missingYes<-missingYes-nrow(cl1)
}
if (nrow(cl0)>=missingNo) {
m<-rbind(m,cl0[seq(1,length.out=missingNo),])
missingNo<-0
}
else {
m<-rbind(m,cl0)
missingNo<-missingNo-nrow(cl0)
}
}
levels(m$head_shape)<-c("round","square","octagon")
levels(m$body_shape)<-c("round","square","octagon")
levels(m$is_smiling)<-c("yes","no")
levels(m$holding)<-c("sword","baloon","flag")
levels(m$jacket_color)<-c("red","yellow","green","blue")
levels(m$has_tie)<-c("yes","no")
levels(m$class)<-c("no","yes")
monk<-m[sample(noInst),]
monk
}
# the data.frame set is converted to a form such that all the attributes have values between 0 and 1
# this is useful in visualization
varNormalization<-function(md, set){
#d-discrete, a-attribure, n-names
column<-length(set[1,]);
n<-length(set[,1]);
colPos<-matrix(FALSE, column, column);
dan<-md$discAttrNames;
nd<-length(dan);
ian<-0;
if(nd>0){
int<-vector("numeric",nd);
for(ian in 1:nd)
{
search<-dan[ian];
colPos[ian,]<-unlist(lapply(attr(set, 'names'),function(x){x==search}));
int[ian]<-1/(length(levels(set[, colPos[ian, ]]))-1);
}
}
nan<-md$numAttrNames;
nn<-length(nan);
if(nn > 0){
offset<-ian;
mi<-vector("numeric", nn);
sigma<-vector("numeric", nn);
maxnorm<-vector("numeric", nn);
moveup<-vector("numeric", nn);
for(ian in 1:nn){
search<-nan[ian];
tmp<-unlist(lapply(attr(set, 'names'),function(x){x==search}));
colPos[ian+offset,]<-tmp
mi[ian]<-as.numeric(mean(set[,tmp]));
sigma[ian]<-as.numeric(sd(set[,tmp]));
allcurval<-(set[, tmp]-mi[ian])/sigma[ian];
moveup[ian]<-min(c(0,allcurval));
maxnorm[ian]<-max(allcurval-moveup[ian])
}
}
out<-NULL;
classV<- attr(md$terms, "variables")[[2]];
classV<-unlist(lapply(attr(set, 'names'),function(x){x!=classV}));
for(ex in 1:n){
pos<-vector("numeric", column);
if(nd>0){
for(da in 1:nd) {
lev<-levels(set[, colPos[da, ]]);
val<-set[ex, colPos[da, ]];
if(is.na(val)){
index<-1;
}
else{
index<-c(1:length(lev))[lev==val];
}
pos[colPos[da, ]]<-(index-1)*int[da];
}
}
if(nn>0){
for(na in 1:nn){
normal<-((set[ex, colPos[na+offset, ]]-mi[na])/sigma[na]);
val<-(normal-moveup[na])/maxnorm[na];
pos[colPos[na+offset, ]]<-val;
}
}
out<-c(out,pos[classV]);
}
out<-matrix(out, n, column-1, TRUE);
}
plotRFNorm<-function(point, cluster, somnames, lOffset,myHoriz=FALSE, myAxes=FALSE)
{
noVar<-dim(point)[2];
ylim<-c(min(point), max(point)+lOffset)
xlim<-c(1, noVar)
if(is.logical(myAxes)){
axesShow<-TRUE;
}
else{
axesShow<-FALSE;
}
plot(1, 1, xlim=xlim, ylim=ylim, type="n", ann=FALSE, frame=TRUE, axes=axesShow);
if(!is.logical(myAxes) && length(myAxes) > 0){
axis(2);
axis(1, at=1:noVar, labels=myAxes);
}
tmpCluster <- cluster;
clusterLevelNames <- list();
clusterLevels <- 0;
i <- 1;
while(length(tmpCluster) > 0 && i < 13){
clusterLevels[i] <- i;
clusterLevelNames[i]<-tmpCluster[1]
cluster[cluster==tmpCluster[1]]<-i;
tmpCluster <- tmpCluster[tmpCluster!=tmpCluster[1]];
i <- i+1;
}
clusterLevels <- clusterLevels[clusterLevels>0];
myPch <-0;
myColor <-0;
myCount<-7
nexamples<-length(point[,1]);
for(value in 1:nexamples){
#mod
myPch[cluster[value]]<-floor(cluster[value]/myCount);
myColor[cluster[value]]<- 1+cluster[value] - myCount*floor(cluster[value]/myCount);
points(point[value,], col=myColor[cluster[value]], pch=myPch[cluster[value]]);
lines(point[value,], col=myColor[cluster[value]], pch=myPch[cluster[value]]);}
legend(xlim[1], ylim[2], somnames, cex=0.8, col=myColor, pch=myPch, horiz=myHoriz);
}
eps <- 1e-6
#computes cdf from collection of probabilities summing to 1
probs2cdf<-function(probs) {
cdf <- probs
i <- 2
while (i <= length(probs)){
cdf[i] <- cdf[i-1] + cdf[i]
i <- i+1
}
if (abs(cdf[length(cdf)] - 1.0) > eps)
warning("Sum of probability distribution is not equal to 1")
cdf[length(cdf)] <- 1.0
i <- length(cdf) - 1
while (cdf[i] > 1.0) {
cdf[i] <- 1.0
i <- i -1
}
cdf
}
# finds quantile of a factor given by probability distribution
quant<-function(p, probs) {
cdf <- probs2cdf(probs)
val <- findInterval(p, cdf, rightmost.closed=T)
return(val+1)
}
probs2str<-function(x, digits=2) {
s <- "["
for (i in seq(along.with=x)){
s <- paste(s,format(x[i],digits=digits),sep="")
if (i < length(x))
s<-paste(s,", ",sep="")
}
paste(s,"]",sep="")
}
factors2str<-function(x) {
s <- "["
for (i in seq(along.with=x)){
s <- paste(s,format(x[i]),sep="")
if (i < length(x))
s<-paste(s,", ",sep="")
}
paste(s,"]",sep="")
}
intFromProb <- function(probs, n) {
fairN <- probs * n
roundN <- round(fairN)
sumN <- sum(roundN,na.rm=T)
while (sumN > n) {
mostUnfair <- which.is.max(roundN-fairN)
roundN[mostUnfair] <- roundN[mostUnfair] -1
sumN <- sumN - 1
}
while (sumN < n) {
mostUnfair <- which.is.max(fairN-roundN)
roundN[mostUnfair] <- roundN[mostUnfair] + 1
sumN <- sumN + 1
}
roundN
}
#imputation with median value
medianImpute <- function(data) {
for (i in 1:ncol(data)) {
nas <- is.na(data[[i]])
if (any(nas)) {
if (is(data[[i]],"factor"))
data[which(nas), i] <- factorMode(data[[i]])
else
data[which(nas), i] <- median(data[[i]],na.rm=T)
}
}
data
}
# modified median - always returns actual value
mmedian<- function(x) {
sx <- sort(x)
sx[length(sx)/2]
}
# mode - the most frequent value of a factor
factorMode <- function(x) {
freq <- table(x)
f<-names(freq)[which.is.max(freq)]
factor(f, levels=levels(x))
}
normalize01 <- function(data) {
if (ncol(data) > 0) {
for (i in 1:ncol(data)) {
cmin <- min(data[,i], na.rm=TRUE)
cmax <- max(data[,i], na.rm=TRUE)
if (cmin < cmax)
data[,i] <- (data[,i]-cmin)/(cmax-cmin)
else data[,i] <- 0
}
}
data
}
# computes Hellinger distance between two discrete distributions given in the form of probability tables
hellinger <- function(dist1, dist2) {
dist1 <- sqrt(dist1)
dist2 <- sqrt(dist2)
dd <- (dist1-dist2)^2
h <- sqrt(sum(dd))/sqrt(2)
return(h)
}
# a simple toy data generator, producing separated Gaussians, with two or three class values and one to three attributes
toyGauss <- function(noInst=100, noAttr=2, noClassValues=3){
noData <- noInst
if (! noClassValues %in% c(2,3))
stop("Argument noClassValues shall be 2 or 3.")
if (! noAttr %in% 1:3)
stop("Argument noAttr shall be 1, 2, or 3.")
mu1 <- rep(-5,noAttr)
sigma1 <- diag(nrow=noAttr)
attrs1 <- mvrnorm(n=noData,mu=mu1,Sigma=sigma1)
data1<-cbind(attrs1,rep(1,noData)) # attach class value 1
mu2 <- rep(5,noAttr)
sigma2 <- diag(nrow=noAttr)
attrs2 <- mvrnorm(n=noData,mu=mu2,Sigma=sigma2)
data2<-cbind(attrs2,rep(2,noData)) # attach class value 2
mu3 <- rep(0,noAttr)
sigma3 <- diag(nrow=noAttr)
attrs3 <- mvrnorm(n=noData,mu=mu3,Sigma=sigma3)
data3<-cbind(attrs3,rep(3,noData)) # attach class value 3
if (noClassValues==3)
data <- rbind(data1,data2,data3) # merge 3
else if (noClassValues==2)
data <- rbind(data1,data2) # merge 2
data <-as.data.frame(data)
#form attribute names
attrNames <- c("class")
attrNames <- c(paste("A",1:noAttr,sep=""),attrNames)
names(data) <- attrNames
data$"class" <- as.factor(data$"class")
data
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.