# ### Experiment
# How to use the package
# library(MChenINN)
#
# # Step 1
# # Data preparation - receives a list `p` with the points (p$x) and labels (p$classes)
# D <- mchendatapreparation(p)
#
# # Step 2
# # Perform the data condensation with Chen's algorithm and the entropy function
# mchencondensation(D)
#
# # Step 3
# # Show the performance of the entropy function
# # mchendrawconfusion()
# mchendrawentropyfunction()
#
# Build and Reload Package: 'Ctrl + Shift + B'
# Check Package: 'Ctrl + Shift + E'
# Test Package: 'Ctrl + Shift + T'
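#
# A minimal, hedged usage sketch. The input `p` is assumed to be a list with
# p$x (a two-column matrix of points) and p$classes (a vector of labels), as
# read by mchendatapreparation() below; the two-spiral generator here is only
# an illustration, not part of the package.
#
# set.seed(42)
# n <- 200
# t <- sort(runif(n, 0, 3*pi))
# spiral1 <- cbind(t*cos(t),  t*sin(t)) + matrix(rnorm(2*n, sd = 0.3), ncol = 2)
# spiral2 <- cbind(-t*cos(t), -t*sin(t)) + matrix(rnorm(2*n, sd = 0.3), ncol = 2)
# p <- list(x = rbind(spiral1, spiral2), classes = rep(c(1, 2), each = n))
#
# D <- mchendatapreparation(p)
# mchencondensation(D, standdeviation = 0.01, MM = 10)
# mchendrawentropyfunction()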
mchendatapreparation <- function(p){
# p is expected to be a list with p$x (a matrix of points, one row per point)
# and p$classes (a vector of labels)
D <- length(p$x[1,]) # dimensionality
K <- length(unique(p$classes)) # number of classes
N <- length(p$x[,2])/K # number of points per class
data <- data.frame(do.call(cbind, p))
colnames(data) <- c('x','y','label')
Y <- data$label
spiral_features <- p$x
# Euclidean distance between two points
dist_eucl <- function(x1,x2) sqrt(sum((x1 - x2) ^ 2))
# Project every point onto one dimension: its distance to a reference point (the origin)
new_sample <- c(0,0)
distances <- apply(spiral_features, 1, function(x) dist_eucl(x, new_sample))
distances_sorted <- sort(distances, index.return = TRUE)
# Keep the projected elements and their classes aligned in the same (sorted) order
elementos <- distances_sorted$x
classes <- Y[distances_sorted$ix]
Nd <- length(elementos)
D <- data.frame(elementos, classes)
assign("K", K, envir = .GlobalEnv)
assign("N", N, envir = .GlobalEnv)
assign("Nd", Nd, envir = .GlobalEnv)
assign("distances_sorted", distances_sorted, envir = .GlobalEnv)
return(D)
}
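# A quick, hedged check of the prepared data (assumes `p` was built as in the
# example at the top of this file):
# D <- mchendatapreparation(p)
# str(D)            # data frame with `elementos` (1-D projection) and `classes`
# table(D$classes)  # class labels carried over from p$classes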
mchenconvdisteucltodataorig <- function(condensed_training) {
# Map each condensed prototype (a 1-D distance value) back to a point of the
# original data set: find the original observation whose distance to the origin
# is closest to the prototype's `elementos` value and recover its row
# (uses the globals distances_sorted and p created by mchendatapreparation)
condensed_training_set_orig <- data.frame()
x <- distances_sorted$x
ind <- vector()
for (i in 1:length(condensed_training[,1])){
ind[i] <- max(distances_sorted$ix[which(x == x[abs(x-condensed_training$elementos[i]) %in% sort(abs(x-condensed_training$elementos[i]))[i+1]])])
}
p_dataframe <- cbind(unlist(p$x),as.factor(unlist(p$classes)))
p_dataframe_na <- p_dataframe[ind,]
# Drop rows where no valid original point was recovered
row.has.na <- apply(p_dataframe_na, 1, function(x){any(is.na(x))})
condensed_training_set_orig <- p_dataframe_na[!row.has.na,]
colnames(condensed_training_set_orig) <- c('x','y','label')
return(condensed_training_set_orig)
}
mchencondensation <- function(D,ge=-0.1,le=-0.1,standdeviation=0,MM=10,linear=FALSE,npar=TRUE,print=FALSE){
# homogeneidade: negative Shannon entropy of the labels in a subset
# (0 for a perfectly homogeneous subset, more negative for mixed subsets)
homogeneidade <- function (nc,y_part,setsize) {
freqs <- table(y_part)/setsize
hom <- sum(freqs * log2(freqs))
return(hom)
}
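# Worked example (comment only): for y_part = c(1, 1, 2) and setsize = 3 the class
# frequencies are 2/3 and 1/3, so hom = (2/3)*log2(2/3) + (1/3)*log2(1/3) ~ -0.918,
# i.e. the negative Shannon entropy of the subset; a perfectly homogeneous subset
# gives hom = 0, which is why subsets are closed when hom > le (le defaults to -0.1).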
# Quantitative analysis counters
alevel <- vector()
aleveldiv <- 0
alevelcount <- 0
aleveldivcount <- 0
alevelcountNC <- 0
alevelNC <- vector()
timing <- vector()
# Time
start_time <- Sys.time()
finalList <- list()
Divisions <- vector()
entropy_global <- matrix()
entropy_global_sd <- matrix()
di <- 0
# Step 2
# The current number of subsets of the training set
Nc <- 1
# Step 3
# Assume D = training set, C(1) = D and i = 1
i <- 1
C <- list()
C[[i]] <- D
# Step 12
# Repeat while Nc < Nd
while (Nc < Nd) {
# Alevel
# Step 4
# Find two farthest points P1 and P2 in the set D
P1 <- max(D$elementos, na.rm = TRUE)
P2 <- min(D$elementos, na.rm = TRUE)
# Record the division point (P1+P2)/2
# Step 5
# Divide the set D into two subsets D1 and D2
di <- length(Divisions) + 1
Divisions[di] <- (P1+P2)/2
D1 <- subset(D, elementos < (P1+P2)/2, select = c(elementos,classes))
D2 <- subset(D, elementos > (P1+P2)/2, select = c(elementos,classes))
# Step 6
Nc <- i + 1
C[[i]] <- D1
C[[Nc]] <- D2
# Step 7
# Close a subset when it is (nearly) homogeneous, i.e. its negative entropy is
# above the threshold `le` (the alternative criterion, kept in the comments below,
# closes a subset when it contains fewer than two classes)
# Count this division for the analysis counters
aleveldivcount <- aleveldivcount + 1
if( sapply(C[[i]][2], function(x) homogeneidade(i,x,length(x))) > le ) {
# alternative: if( sapply(C[[i]][2], function(x) length(unique(x))) < 2) {
finalList[[length(finalList)+1]] <- C[[i]]
alevelcount <- alevelcount + 1
alevel[alevelcount] <- aleveldivcount
}
if( sapply(C[[Nc]][2], function(x) homogeneidade(Nc,x,length(x))) > le) {
# alternative: if( sapply(C[[Nc]][2], function(x) length(unique(x))) < 2) {
finalList[[length(finalList)+1]] <- C[[Nc]]
alevelcount <- alevelcount + 1
alevel[alevelcount] <- aleveldivcount
}
# Pick the next subset to split: the one whose `elementos` span the widest range
# and which still contains more than one class (classes range > 0)
I <- lapply(C, function(x) sapply(x, function(x) max(x, na.rm = TRUE) - min(x, na.rm = TRUE)))
max_range <- 0
for (t in 1:length(I)) {
if (I[[t]][1] > max_range && I[[t]][2] > 0){
max_range <- I[[t]][1]
list_position <- t
}
}
# No heterogeneous subset left: the condensation is finished
if (max_range == 0) {
break
}
D <- C[[list_position]]
C <- C[-list_position]
# Entropy-based stopping criterion: track a global entropy measure over the
# remaining subsets and stop once its standard deviation over the last MM
# iterations falls below `standdeviation`
df <- do.call(rbind.data.frame, C)
entropy_global[i] <- homogeneidade(1,df[,2],length(df))
if (Nc > MM) {
if( sd(entropy_global[(Nc-MM):(Nc-2)]) < standdeviation) {
break
}}
i <- length(C) + 1
}
# End time
end_time <- Sys.time()
# Each closed subset is condensed to a single prototype: the median of its
# elements and the median of its class labels
condensed_training <- do.call("rbind", lapply(finalList, function(x) sapply(x, function(x) median(x))))
element_after_division <- do.call("rbind", finalList)
rm(i, D, C, max_range, list_position, I)
condensed_training <- as.data.frame(condensed_training)
# Unused helper: draw a random sample of n rows from a data frame
randomSample <- function(condensed_training, n) {
return(condensed_training[sample(nrow(condensed_training), n), ])
}
# Map the condensed 1-D prototypes back to points of the original data set
condensed_training_to_original_data <- mchenconvdisteucltodataorig(condensed_training)
dataframeList <- list(condensed_training = condensed_training,
                      entropy_global = entropy_global,
                      end_time = end_time,
                      start_time = start_time,
                      standdeviation = standdeviation,
                      Nd = Nd,
                      condensed_training_to_original_data = condensed_training_to_original_data)
mchenassingvariables(dataframeList)
assign("entropy_global_sd", entropy_global_sd, envir = .GlobalEnv)
#return(dataframeList)
}
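# A hedged usage sketch of the condensation step (assumes `D` was produced by
# mchendatapreparation(); `standdeviation` and `MM` control the entropy-based
# stopping rule and `le` the homogeneity threshold for closing a subset):
# mchencondensation(D, le = -0.1, standdeviation = 0.01, MM = 10)
# # Results are exported to the global environment:
# nrow(condensed_training)                    # number of 1-D prototypes
# head(condensed_training_to_original_data)   # prototypes mapped back to original points
# end_time - start_time                       # elapsed condensation time
# mchendrawentropyfunction()                  # entropy curve and summary of the run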
# Assign results to global variables
mchenassingvariables <- function(dataList){
condensed_training <- as.data.frame(dataList[['condensed_training']])
entropy_global <- (dataList[['entropy_global']])
end_time <- (dataList[['end_time']])
start_time <- (dataList[['start_time']])
standdeviation <- (dataList[['standdeviation']])
Nd <- (dataList[['Nd']])
condensed_training_to_original_data <- (dataList[['condensed_training_to_original_data']])
assign("condensed_training", condensed_training, envir = .GlobalEnv)
assign("entropy_global", entropy_global, envir = .GlobalEnv)
assign("end_time", end_time, envir = .GlobalEnv)
assign("start_time", start_time, envir = .GlobalEnv)
assign("standdeviation", standdeviation, envir = .GlobalEnv)
assign("Nd", Nd, envir = .GlobalEnv)
assign("condensed_training_to_original_data", condensed_training_to_original_data, envir = .GlobalEnv)
}
mchendrawdistriborig <- function() {
# Plot the original data distribution (uses the global `p` created by the caller)
data <- data.frame(do.call(cbind, p))
colnames(data) <- c('x','y','label')
x_min <- min(data[,1])-0.2; x_max <- max(data[,1])+0.2
y_min <- min(data[,2])-0.2; y_max <- max(data[,2])+0.2
# let's visualize the data:
ggplot(data) + geom_point(aes(x=x, y=y, color = as.character(label)), size = 2) + theme_bw(base_size = 15) +
xlim(x_min, x_max) + ylim(y_min, y_max) +
ggtitle('Data Distribution') +
coord_fixed(ratio = 0.8) +
theme(axis.ticks=element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
axis.text=element_blank(), axis.title=element_blank(), legend.position = 'none')
}
mchendrawdistricondensed <- function() {
# Plot the condensed data distribution (uses the global condensed_training_to_original_data)
condensed_data <- condensed_training_to_original_data
x_min <- min(condensed_data[,1])-0.2; x_max <- max(condensed_data[,1])+0.2
y_min <- min(condensed_data[,2])-0.2; y_max <- max(condensed_data[,2])+0.2
# let's visualize the data:
ggplot(as.data.frame(condensed_data)) + geom_point(aes(x=x, y=y, color = as.character(label)), size = 2) + theme_bw(base_size = 15) +
xlim(x_min, x_max) + ylim(y_min, y_max) +
ggtitle('Condensed Data Distribution') +
coord_fixed(ratio = 0.8) +
theme(axis.ticks=element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
axis.text=element_blank(), axis.title=element_blank(), legend.position = 'none')
}
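# A hedged usage sketch: after mchendatapreparation() and mchencondensation() have
# been run (so `p` and condensed_training_to_original_data exist in the global
# environment), the two distributions can be compared. ggplot2 must be attached
# for the plots to render.
# library(ggplot2)
# mchendrawdistriborig()       # full data set
# mchendrawdistricondensed()   # condensed prototypes mapped back to original points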
# Confusion matrix plotting
mchendrawconfusion <- function(actual_class,predicted_class) {
# caret::confusionMatrix expects the predictions first and the reference (actual) classes second,
# matching the 'Predicted' rows and 'Actual' columns drawn below
cm <- caret::confusionMatrix(factor(predicted_class),factor(actual_class))
mchendrawconfusionmatrix <- function(cm) {
layout(matrix(c(1,1,2)))
par(mar=c(2,2,2,2))
plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
title('CONFUSION MATRIX', cex.main=2)
# create the matrix
rect(150, 430, 240, 370, col='#3F97D0')
text(195, 435, 'Class1', cex=1.2)
rect(250, 430, 340, 370, col='#F7AD50')
text(295, 435, 'Class2', cex=1.2)
text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
text(245, 450, 'Actual', cex=1.3, font=2)
rect(150, 305, 240, 365, col='#F7AD50')
rect(250, 305, 340, 365, col='#3F97D0')
text(140, 400, 'Class1', cex=1.2, srt=90)
text(140, 335, 'Class2', cex=1.2, srt=90)
# add in the cm results
res <- as.numeric(cm$table)
text(195, 400, res[1], cex=1.6, font=2, col='white')
text(195, 335, res[2], cex=1.6, font=2, col='white')
text(295, 400, res[3], cex=1.6, font=2, col='white')
text(295, 335, res[4], cex=1.6, font=2, col='white')
# add in the specifics
plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)
# add in the accuracy information
text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
}
mchendrawconfusionmatrix(cm)
}
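# A hedged usage sketch: after mchenknn() has populated the globals `validation` and
# `valid_pred_class_bin`, the confusion matrix of the condensed-data model on the
# full original data can be drawn (assumes both vectors use the same two labels):
# mchendrawconfusion(validation$label, valid_pred_class_bin)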
# Plot the entropy-function performance
mchendrawentropyfunction <- function(){
plot(1:length(entropy_global),log(entropy_global)*-1, main="Auto-Generated Prototype Number Function",
xlab="Division iterations - group generation", ylab="Variability - global entropy",type= "l",cex = .6)
legend(length(entropy_global)*.50,(max(log(entropy_global))*-1)+0.9, legend=c("Elapsed Time (secs)",end_time-start_time),
col=c("red"),cex=1,bty = "n")
legend(length(entropy_global)*.50,(max(log(entropy_global))*-1)+0.7, legend=c("Data Points",Nd),
col=c("red"),cex=1,bty = "n")
legend(length(entropy_global)*.50,(max(log(entropy_global))*-1)+0.5, legend=c("Reduction in %",(1-(length(condensed_training$elementos)/Nd))*100),
col=c("red"),cex=1,bty = "n")
legend(length(entropy_global)*.50,(max(log(entropy_global))*-1)+0.3, legend=c("Number of prototypes",length(condensed_training$elementos)),
col=c("red"),cex=1,bty = "n")
legend(length(entropy_global)*.10,(max(log(entropy_global))*-1)+0.9, legend=c("Standard Deviation (SD) ",standdeviation),
col=c("red"),cex=1,bty = "n")
}
mchenknn <- function(){
# Train a KNN model (via caret) on the condensed data and validate it on the full
# original data set; requires the caret, plyr (mapvalues) and ROCR packages
data1 <- as.data.frame(condensed_training_to_original_data)
# Transform the dependent variable to a factor
data1$label = as.factor(data1$label)
#Partitioning the data into training and validation data
set.seed(101)
index = createDataPartition(data1$label, p = 0.7, list = F )
train = data1[index,]
validation = data1[-index,]
# Setting levels for both training and validation data
levels(train$label) <- make.names(levels(factor(train$label)))
levels(validation$label) <- make.names(levels(factor(validation$label)))
# Setting up train controls
repeats = 3
numbers = 10
tunel = 10
set.seed(1234)
x = trainControl(method = "repeatedcv",
number = numbers,
repeats = repeats,
classProbs = TRUE,
summaryFunction = twoClassSummary)
model1 <- train(label~. , data = train, method = "knn",
preProcess = c("center","scale"),
trControl = x,
metric = "ROC",
tuneLength = tunel)
# Validation
# Validate against the full original data set
validation <- cbind(p$x,p$classes)
validation <- as.data.frame(validation)
names(validation) <- c("x","y","label")
validation$label <- as.factor(validation$label)
valid_pred <- predict(model1,validation, type = "prob")
valid_pred_class <- predict(model1,validation, type = "raw")
valid_pred_class_bin <-mapvalues(valid_pred_class,from=c("X1","X2"),to=c(1,2))
#Storing Model Performance Scores
#library(ROCR)
pred_val <-prediction(valid_pred[,2],validation$label)
# Calculating Area under Curve (AUC)
perf_val <- performance(pred_val,"auc")
#perf_val
assign("pred_val", pred_val, envir = .GlobalEnv)
assign("perf_val", perf_val, envir = .GlobalEnv)
assign("model1", model1, envir = .GlobalEnv)
assign("valid_pred_class", valid_pred_class, envir = .GlobalEnv)
assign("valid_pred", valid_pred, envir = .GlobalEnv)
assign("validation", validation, envir = .GlobalEnv)
assign("valid_pred_class_bin", valid_pred_class_bin, envir = .GlobalEnv)
}
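# A hedged usage sketch of the evaluation pipeline (assumes mchencondensation() has
# already populated condensed_training_to_original_data and that caret, plyr and
# ROCR are attached):
# mchenknn()                 # fits the KNN model and fills the global prediction objects
# mchendrawmodelknn()        # tuning summary and plot of the fitted model
# mchendrawaucknn()          # ROC curve on the full original data
# performance(pred_val, "auc")@y.values[[1]]  # numeric AUC
# mchendrawksstat()          # Kolmogorov-Smirnov statistic of the ROC curve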
mchendrawaucknn <- function(){
# Plot AUC
perf_val <- performance(pred_val, "tpr", "fpr")
plot(perf_val, col = "green", lwd = 1.5)
assign("perf_val", perf_val, envir = .GlobalEnv)
}
mchendrawksstat <- function() {
# Kolmogorov-Smirnov statistic: maximum gap between TPR and FPR along the ROC curve
# (assumes mchendrawaucknn() has already set perf_val to a "tpr"/"fpr" performance object)
ks <- max(attr(perf_val, "y.values")[[1]] - (attr(perf_val, "x.values")[[1]]))
ks
}
mchendrawmodelknn <- function(){
# Summary of the model (printed explicitly so it shows when called inside a function)
print(model1)
plot(model1)
}
mchencrossvalidation <- function(){
# Under development
# 10-fold cross-validation
# Randomly shuffle the data
yourData<-condensed_training_to_original_data[sample(nrow(condensed_training_to_original_data)),]
# Create 10 equally sized folds
folds <- cut(seq(1,nrow(condensed_training_to_original_data)),breaks=10,labels=FALSE)
# Perform 10-fold cross-validation
for(i in 1:10){
# Segment the data by fold using the which() function
testIndexes <- which(folds==i,arr.ind=TRUE)
testData <- yourData[testIndexes, ]
trainData <- yourData[-testIndexes, ]
# Use the test and train data partitions however you desire...
}
}
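# A hedged sketch of how the cross-validation stub above could be completed, using
# class::knn() as the per-fold classifier (k = 5 here is an illustrative choice, not
# a package default):
# library(class)
# accs <- numeric(10)
# data_cv <- as.data.frame(condensed_training_to_original_data)
# data_cv <- data_cv[sample(nrow(data_cv)), ]
# folds <- cut(seq_len(nrow(data_cv)), breaks = 10, labels = FALSE)
# for (i in 1:10) {
#   test_idx <- which(folds == i)
#   pred <- class::knn(train = data_cv[-test_idx, c("x","y")],
#                      test  = data_cv[test_idx,  c("x","y")],
#                      cl    = factor(data_cv$label[-test_idx]), k = 5)
#   accs[i] <- mean(pred == data_cv$label[test_idx])
# }
# mean(accs)  # average 10-fold accuracy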