# Global knitr chunk options: echo the code, render figures at 85% width,
# centre them, and suppress warnings in the knitted output.
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
knitr::opts_chunk$set(
  echo = TRUE,
  out.width = "85%",
  fig.align = "center",
  warning = FALSE
)
Importing needed packages
# Print the working directory; the relative source() paths below are resolved
# against it.
getwd()

# Packages are attached in the original order. Note that plyr is attached
# after dplyr, so plyr's same-named functions mask dplyr's for unqualified
# calls made later in this file.
library(ape)
library(dendextend)
library(cluster)
library(tibble)
library(magrittr)
library(dplyr)
library(phytools)
library(mltools)
library(data.table)
library(factoextra)

# Project modules.
source("../modules/convert_to_parenthesis.R")
source("../modules/cv_score.R")

library(tictoc)
library(mvMORPH)
library(dplyr)
library(tidyr)
library(RColorBrewer)
library(MASS)
library(kableExtra)
library(plyr)
library(clValid)
First working with a simulated data set:
# Simulate a two-class, two-dimensional Gaussian sample (n = 170).
set.seed(500)
n <- 170
p <- 2

# Class labels, drawn with equal probability.
Y <- sample(c(0, 1), n, replace = TRUE, prob = c(0.5, 0.5))

# Class means and the covariance matrix shared by both classes.
mu_0 <- c(0, 0)
mu_1 <- c(1, 2)
S <- cbind(c(2, 1), c(1, 3))

# Fill the rows of each class with draws rounded to 4 decimals
# (class 0 first, then class 1, to keep the random stream unchanged).
X <- matrix(nrow = n, ncol = p)
X[Y == 0, ] <- round(mvrnorm(sum(Y == 0), mu_0, S), 4)
X[Y == 1, ] <- round(mvrnorm(sum(Y == 1), mu_1, S), 4)

# Assemble the data frame with the label stored as a factor.
sim.data <- as.data.frame(cbind(X, Y))
sim.data$Y <- as.factor(Y)
colnames(sim.data) <- c("X1", "X2", "Y")
row.names(sim.data) <- seq_len(nrow(sim.data))
head(sim.data, 4)
We can compare a supervised prediction score, computed from a hierarchical clustering fitted with the ward.D2 method, against our score:
# Time one supervised comparison for hclust with the ward.D2 linkage;
# column 3 of sim.data is passed as test_index.
tic("Time of the supervised comparison")
comp <- supervised_comparing(
  sim.data,
  clust = "hclust",
  clust_method = "ward.D2",
  test_index = 3
)
comp
toc()
Testing the list input for supervised_comparing and cv_score algorithms:
# Named list: one entry per clustering function, holding the method names to
# try for it (NA for diana).
test.list <- list(
  hclust = c(
    "ward.D", "single", "ward.D2", "average",
    "complete", "mcquitty", "median", "centroid"
  ),
  agnes = c("weighted", "average", "ward"),
  diana = NA
)

# Inspect the list structure.
names(test.list)
test.list[[3]]

# Example of a combined "<function><method>" label.
paste0(names(test.list)[1], test.list[[1]][1])
Running all supervised comparisons:
# Time the supervised comparison over every function/method pair in test.list.
tic("Running time for all_supervised_comparing")
data.test <- all_supervised_comparing(
  sim.data,
  clust = test.list,
  test_index = 3
)
data.test
toc()
Graphing the comparison:
# Scatter plot of V1 (labelled "Proportion of hits") against V3 ("Scores").
ggplot(
  mutate(data.test, names = rownames(data.test)),
  aes(x = V1, y = V3)
) +
  geom_point(color = "red") +
  labs(x = "Proportion of hits", y = "Scores") +
  theme_minimal()
\
# FOM on the feature columns only (column 3, the label, is dropped),
# asking for 2 levels.
sim.fom <- FOM(sim.data[, -3], nlvls = 2, test.list)
sim.fom
FOM versus score:
# Attach the FOM values and plot them against V3 ("Scores").
data.test$FOM <- sim.fom$V1
ggplot(
  mutate(data.test, names = rownames(data.test)),
  aes(x = FOM, y = V3)
) +
  geom_point(color = "red") +
  labs(x = "FOM score", y = "Scores") +
  theme_minimal()

# Repeat the full comparison for several distances.
dists <- c("euclidean", "manhattan", "canberra")
tic("Running time for all_supervised_comparing")
data.test <- all_supervised_comparing(
  sim.data,
  clust = test.list,
  test_index = 3,
  dist = dists
)
data.test
toc()
Computing FOM:
# FOM for every method/distance combination.
sim.fom <- FOM(sim.data[, -3], nlvls = 2, test.list, dists = dists)

# Sort both tables by their distance column (V2 in sim.fom, V4 in data.test)
# so their rows can be paired up afterwards.
sim.fom <- dplyr::arrange(sim.fom, V2)
# arranging data.test
data.test <- dplyr::arrange(data.test, V4)
sim.fom$V1 <- as.numeric(sim.fom$V1)
sim.fom
Graphing the comparison according to distance:
# Coerce the metric columns to numeric and the distance column to a factor.
data.test <- mutate(
  data.test,
  V1 = as.numeric(V1),
  V2 = as.numeric(V2),
  V3 = as.numeric(V3),
  V4 = as.factor(V4)
)

# Proportion of hits versus score, coloured by distance.
ggplot(data.test, aes(x = V1, y = V3, colour = V4)) +
  geom_point() +
  theme_minimal() +
  labs(
    x = "Proportion of hits",
    y = "Scores",
    colour = "Distances",
    title = "Scatterplot of proportion of hits versus Score values according to different distances"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_colour_brewer(palette = "Set1")

# F1 versus score, coloured by distance.
ggplot(data.test, aes(x = V2, y = V3, colour = V4)) +
  geom_point() +
  theme_minimal() +
  labs(
    x = "F1 score",
    y = "Scores",
    colour = "Distances",
    title = "Scatterplot of F1 versus Score values according to different distances"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_colour_brewer(palette = "Set1")

# FOM versus score, coloured by distance.
data.test$FOM <- sim.fom$V1
ggplot(data.test, aes(x = FOM, y = V3, colour = V4)) +
  geom_point() +
  theme_minimal() +
  labs(
    x = "FOM score",
    y = "Scores",
    colour = "Distances",
    title = "Scatterplot of FOM versus Score values according to different distances"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_colour_brewer(palette = "Set1")

# Rank correlation between FOM and the score.
cor(data.test$FOM, data.test$V3, method = "spearman")
Increasing sample size:
# Same two-class Gaussian design as before, with a larger sample (n = 550).
set.seed(500)
n <- 550
p <- 2

# Class labels, drawn with equal probability.
Y <- sample(c(0, 1), n, replace = TRUE, prob = c(0.5, 0.5))

# Class means and shared covariance matrix.
mu_0 <- c(0, 0)
mu_1 <- c(1, 2)
S <- cbind(c(2, 1), c(1, 3))

# Draw the features class by class (class 0 first), rounded to 4 decimals.
X <- matrix(nrow = n, ncol = p)
X[Y == 0, ] <- round(mvrnorm(sum(Y == 0), mu_0, S), 4)
X[Y == 1, ] <- round(mvrnorm(sum(Y == 1), mu_1, S), 4)

sim.data <- as.data.frame(cbind(X, Y))
sim.data$Y <- as.factor(Y)
colnames(sim.data) <- c("X1", "X2", "Y")
row.names(sim.data) <- seq_len(nrow(sim.data))
head(sim.data, 4)
# Full comparison on the larger sample, for each distance.
dists <- c("euclidean", "manhattan", "canberra")
tic("Running time for all_supervised_comparing")
data.test2 <- all_supervised_comparing(
  sim.data,
  clust = test.list,
  test_index = 3,
  dist = dists
)
data.test2
toc()
# FOM for every method/distance combination on the larger sample.
sim.fom2 <- FOM(sim.data[, -3], nlvls = 2, test.list, dists = dists)
sim.fom2 <- sim.fom2 %>% dplyr::arrange(V2)

# Arrange data.test2 by its distance column. This must start from
# data.test2 itself: starting from data.test would silently replace the
# larger-sample results with those of the previous, smaller run.
data.test2 <- data.test2 %>% dplyr::arrange(V4)

sim.fom2$V1 <- as.numeric(sim.fom2$V1)
sim.fom2
# Coerce the metric columns to numeric and the distance column to a factor.
data.test2 <- mutate(
  data.test2,
  V1 = as.numeric(V1),
  V2 = as.numeric(V2),
  V3 = as.numeric(V3),
  V4 = as.factor(V4)
)

# Proportion of hits versus score, coloured by distance.
ggplot(data.test2, aes(x = V1, y = V3, colour = V4)) +
  geom_point() +
  theme_minimal() +
  labs(
    x = "Proportion of hits",
    y = "Scores",
    colour = "Distances",
    title = "Scatterplot of proportion of hits versus Score values according to different distances"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_colour_brewer(palette = "Set1")

# F1 versus score, coloured by distance.
ggplot(data.test2, aes(x = V2, y = V3, colour = V4)) +
  geom_point() +
  theme_minimal() +
  labs(
    x = "F1 score",
    y = "Scores",
    colour = "Distances",
    title = "Scatterplot of F1 versus Score values according to different distances"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_colour_brewer(palette = "Set1")

# FOM versus score, coloured by distance.
data.test2$FOM <- sim.fom2$V1
ggplot(data.test2, aes(x = FOM, y = V3, colour = V4)) +
  geom_point() +
  theme_minimal() +
  labs(
    x = "FOM score",
    y = "Scores",
    colour = "Distances",
    title = "Scatterplot of FOM versus Score values according to different distances"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_colour_brewer(palette = "Set1")

# Rank correlation between FOM and the score.
cor(data.test2$FOM, data.test2$V3, method = "spearman")
Repeating the whole process for more separated data:
# Two well-separated Gaussian classes (n = 170) with identity covariance.
set.seed(150)
n <- 170
p <- 2

# Class labels, drawn with equal probability.
Y <- sample(c(0, 1), n, replace = TRUE, prob = c(0.5, 0.5))

# Class means far apart; identity covariance for both classes.
mu_0 <- c(0, 0)
mu_1 <- c(6, 8)
S <- diag(nrow = 2, ncol = 2)

# Draw the features class by class (class 0 first), rounded to 4 decimals.
X <- matrix(nrow = n, ncol = p)
X[Y == 0, ] <- round(mvrnorm(sum(Y == 0), mu_0, S), 4)
X[Y == 1, ] <- round(mvrnorm(sum(Y == 1), mu_1, S), 4)

sim.data <- as.data.frame(cbind(X, Y))
sim.data$Y <- as.factor(Y)
colnames(sim.data) <- c("X1", "X2", "Y")
row.names(sim.data) <- seq_len(nrow(sim.data))
head(sim.data, 4)
# Full comparison on the separated sample, for each distance.
dists <- c("euclidean", "manhattan", "canberra")
tic("Running time for all_supervised_comparing")
data.test <- all_supervised_comparing(
  sim.data,
  clust = test.list,
  test_index = 3,
  dist = dists
)
data.test
toc()
Computing FOM:
# FOM for every method/distance combination on the separated sample.
sim.fom <- FOM(sim.data[, -3], nlvls = 2, test.list, dists = dists)

# Sort both tables by their distance column so rows can be paired up.
sim.fom <- dplyr::arrange(sim.fom, V2)
# arranging data.test
data.test <- dplyr::arrange(data.test, V4)
sim.fom$V1 <- as.numeric(sim.fom$V1)
sim.fom
Graphing the comparison according to distance:
# Coerce the metric columns to numeric and the distance column to a factor.
data.test <- mutate(
  data.test,
  V1 = as.numeric(V1),
  V2 = as.numeric(V2),
  V3 = as.numeric(V3),
  V4 = as.factor(V4)
)

# Proportion of hits versus score, coloured by distance.
ggplot(data.test, aes(x = V1, y = V3, colour = V4)) +
  geom_point() +
  theme_minimal() +
  labs(
    x = "Proportion of hits",
    y = "Scores",
    colour = "Distances",
    title = "Scatterplot of proportion of hits versus Score values according to different distances"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_colour_brewer(palette = "Set1")

# FOM versus score, coloured by distance.
data.test$FOM <- sim.fom$V1
ggplot(data.test, aes(x = FOM, y = V3, colour = V4)) +
  geom_point() +
  theme_minimal() +
  labs(
    x = "FOM score",
    y = "Scores",
    colour = "Distances",
    title = "Scatterplot of FOM versus Score values according to different distances"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_colour_brewer(palette = "Set1")

# Rank correlation between FOM and the score.
cor(data.test$FOM, data.test$V3, method = "spearman")
Increasing sample size:
# Two well-separated Gaussian classes with a larger sample (n = 550).
set.seed(500)
n <- 550
p <- 2

# Class labels, drawn with equal probability.
Y <- sample(c(0, 1), n, replace = TRUE, prob = c(0.5, 0.5))

# Class means far apart; identity covariance for both classes.
mu_0 <- c(0, 0)
mu_1 <- c(6, 8)
S <- diag(nrow = 2, ncol = 2)

# Draw the features class by class (class 0 first), rounded to 4 decimals.
X <- matrix(nrow = n, ncol = p)
X[Y == 0, ] <- round(mvrnorm(sum(Y == 0), mu_0, S), 4)
X[Y == 1, ] <- round(mvrnorm(sum(Y == 1), mu_1, S), 4)

sim.data <- as.data.frame(cbind(X, Y))
sim.data$Y <- as.factor(Y)
colnames(sim.data) <- c("X1", "X2", "Y")
row.names(sim.data) <- seq_len(nrow(sim.data))
head(sim.data, 4)
# Full comparison on the larger separated sample, for each distance.
dists <- c("euclidean", "manhattan", "canberra")
tic("Running time for all_supervised_comparing")
data.test2 <- all_supervised_comparing(
  sim.data,
  clust = test.list,
  test_index = 3,
  dist = dists
)
data.test2
toc()

# FOM for the same combinations; both tables are then sorted by their
# distance column so rows can be paired up.
sim.fom2 <- FOM(sim.data[, -3], nlvls = 2, test.list, dists = dists)
sim.fom2 <- dplyr::arrange(sim.fom2, V2)
# arranging data.test
data.test2 <- dplyr::arrange(data.test2, V4)
sim.fom2$V1 <- as.numeric(sim.fom2$V1)
sim.fom2
# Coerce the metric columns to numeric and the distance column to a factor,
# exactly as the parallel chunks in this file do before plotting. Without this
# step the plot below and the later cor() call receive unconverted columns.
data.test2 <- data.test2 %>%
  mutate(
    V1 = as.numeric(V1),
    V2 = as.numeric(V2),
    V3 = as.numeric(V3),
    V4 = as.factor(V4)
  )

# Proportion of hits versus score, coloured by distance.
data.test2 %>%
  ggplot(aes(x = V1, y = V3, colour = V4)) +
  geom_point() +
  theme_minimal() +
  labs(
    x = "Proportion of hits",
    y = "Scores",
    colour = "Distances",
    title = "Scatterplot of proportion of hits versus Score values according to different distances"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_colour_brewer(palette = "Set1")
# FOM versus score, coloured by distance.
data.test2$FOM <- sim.fom2$V1
ggplot(data.test2, aes(x = FOM, y = V3, colour = V4)) +
  geom_point() +
  theme_minimal() +
  labs(
    x = "FOM score",
    y = "Scores",
    colour = "Distances",
    title = "Scatterplot of FOM versus Score values according to different distances"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_colour_brewer(palette = "Set1")

# Rank correlation between FOM and the score.
cor(data.test2$FOM, data.test2$V3, method = "spearman")
Testing iris dataset:
# Full comparison on iris; column 5 (Species) is passed as test_index.
flowers <- iris
tic("Running time for all_supervised_comparing in iris dataset")
iris.test <- all_supervised_comparing(
  flowers,
  clust = test.list,
  test_index = 5,
  dist = dists
)
iris.test
toc()
# FOM on the four iris measurements, asking for 3 levels.
iris.fom <- FOM(flowers[, -5], nlvls = 3, test.list, dists = dists)

# Sort both tables by their distance column so rows can be paired up.
iris.fom <- dplyr::arrange(iris.fom, V2)
# arranging data.test
iris.test <- dplyr::arrange(iris.test, V4)
iris.fom$V1 <- as.numeric(iris.fom$V1)
iris.fom
And then, graphing it: Proportion of hits versus Score values
# Coerce the metric columns to numeric and the distance column to a factor.
iris.test <- mutate(
  iris.test,
  V1 = as.numeric(V1),
  V2 = as.numeric(V2),
  V3 = as.numeric(V3),
  V4 = as.factor(V4)
)

# Proportion of hits versus score, coloured by distance.
ggplot(iris.test, aes(x = V1, y = V3, colour = V4)) +
  geom_point() +
  theme_minimal() +
  labs(
    x = "Proportion of hits",
    y = "Scores",
    colour = "Distances",
    title = "Scatterplot of proportion of hits versus Score values according to different distances"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_colour_brewer(palette = "Set1")
\ F1 versus Score values
# F1 versus score, coloured by distance.
ggplot(iris.test, aes(x = V2, y = V3, colour = V4)) +
  geom_point() +
  theme_minimal() +
  labs(
    x = "F1 score",
    y = "Scores",
    colour = "Distances",
    title = "Scatterplot of F1 versus Score values according to different distances"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_colour_brewer(palette = "Set1")
\
# FOM versus score, coloured by distance.
iris.test$FOM <- iris.fom$V1
ggplot(iris.test, aes(x = FOM, y = V3, colour = V4)) +
  geom_point() +
  theme_minimal() +
  labs(
    x = "FOM score",
    y = "Scores",
    colour = "Distances",
    title = "Scatterplot of FOM versus Score values according to different distances"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_colour_brewer(palette = "Set1")

# Rank correlation between FOM and the score. This chunk uses Kendall's tau,
# whereas the simulated-data chunks use Spearman.
cor(iris.test$FOM, iris.test$V3, method = "kendall")
Testing the supervised comparison for the cross-validation score:
# Two-class, two-dimensional Gaussian sample (n = 170) for the
# cross-validation experiments.
set.seed(500)
n <- 170
p <- 2

# Class labels, drawn with equal probability.
Y <- sample(c(0, 1), n, replace = TRUE, prob = c(0.5, 0.5))

# Class means and shared covariance matrix.
mu_0 <- c(0, 0)
mu_1 <- c(1, 2)
S <- cbind(c(2, 1), c(1, 3))

# Draw the features class by class (class 0 first), rounded to 4 decimals.
X <- matrix(nrow = n, ncol = p)
X[Y == 0, ] <- round(mvrnorm(sum(Y == 0), mu_0, S), 4)
X[Y == 1, ] <- round(mvrnorm(sum(Y == 1), mu_1, S), 4)

sim.data <- as.data.frame(cbind(X, Y))
sim.data$Y <- as.factor(Y)
colnames(sim.data) <- c("X1", "X2", "Y")
row.names(sim.data) <- seq_len(nrow(sim.data))
head(sim.data, 4)
# Clustering functions and methods used for the cross-validated comparison.
cl.list <- list(
  hclust = c(
    "single", "ward.D2", "mcquitty", "average",
    "centroid", "complete", "ward.D"
  ),
  agnes = c("weighted", "average"),
  diana = NA
)
dists <- c("euclidean", "manhattan", "canberra")

# Time the cross-validated comparison and preview its first rows.
tic("Running time for all_supervised_comparing")
cv.data.test <- supervised_comparing_L_cross_val(
  sim.data,
  cl.list,
  test_index = 3,
  dists = dists
)
head(cv.data.test, 10)
toc()
# Score (V1) against proportion of hits, coloured by distance (V2);
# coord_flip() swaps the displayed axes.
ggplot(cv.data.test, aes(x = V1, y = hits, colour = V2)) +
  geom_point() +
  theme_minimal() +
  labs(
    x = "Scores",
    y = "Proportion of hits",
    colour = "Distances",
    title = "Scatterplot of proportion of hits versus Score values according to different distances"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_colour_brewer(palette = "Set1") +
  coord_flip()

# Score (V1) against F1, coloured by distance (V2), axes flipped as above.
ggplot(cv.data.test, aes(x = V1, y = F1, colour = V2)) +
  geom_point() +
  theme_minimal() +
  labs(
    x = "Scores",
    y = "F1",
    colour = "Distances",
    title = "Scatterplot of F1 versus Score values according to different distances"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_colour_brewer(palette = "Set1") +
  coord_flip()
# Sort the cross-validated results and the FOM table by their V2 column,
# then copy the numeric FOM values across row by row.
cv.data.test <- dplyr::arrange(cv.data.test, V2)
cv.fom <- FOM(sim.data[, -3], nlvls = 2, cl.list, dists = dists)
cv.fom$V1 <- as.numeric(cv.fom$V1)
cv.fom <- dplyr::arrange(cv.fom, V2)
cv.data.test$FOM <- cv.fom$V1
# Score (V1) against FOM, coloured by distance (V2); coord_flip() swaps the
# displayed axes.
ggplot(cv.data.test, aes(x = V1, y = FOM, colour = V2)) +
  geom_point() +
  theme_minimal() +
  labs(
    x = "Scores",
    y = "FOM score",
    colour = "Distances",
    title = "Scatterplot of FOM versus Score values according to different distances"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_colour_brewer(palette = "Set1") +
  coord_flip()
\ To facilitate further investigation of several datasets, we can write functions that draw pairwise plots of FOM and our score against the other variables:
# Private helper: scatter plot of column `x` against column `y` of `data`,
# coloured by column `colour`, with the theme shared by every figure here.
# `x_name` / `y_name` are the phrases inserted into the plot title.
.superv_scatter <- function(data, x, y, colour, x_lab, y_lab, x_name, y_name) {
  ggplot(
    data,
    aes(x = .data[[x]], y = .data[[y]], colour = .data[[colour]])
  ) +
    geom_point() +
    theme_minimal() +
    labs(
      x = x_lab,
      y = y_lab,
      colour = "Distances",
      title = paste(
        "Scatterplot of", x_name, "versus", y_name,
        "according to different distances"
      )
    ) +
    theme(
      text = element_text(size = 11, family = "serif"),
      plot.title = element_text(hjust = 0.5)
    ) +
    scale_colour_brewer(palette = "Set1")
}

# function for default score:
# Runs all_supervised_comparing() and FOM() on `data`, shows five scatter
# plots and prints Spearman correlations between the score, FOM and the
# supervised metrics.
#
# data        data frame whose column `test_index` is a factor of true labels.
# clust.list  named list of clustering functions and their methods.
# test_index  position of the label column in `data`.
# dist        distances forwarded to both all_supervised_comparing() and FOM().
#
# Returns the comparison table with an added numeric FOM column.
# NOTE(review): going by the plot labels, V1 = proportion of hits, V2 = F1,
# V3 = score and V4 = distance in the comparison table -- confirm against
# all_supervised_comparing().
display.superv_comp <- function(data, clust.list, test_index, dist = NA) {
  k <- nlevels(data[, test_index])
  data.test <- all_supervised_comparing(
    data,
    clust = clust.list,
    test_index = test_index,
    dist = dist
  )
  fom.test <- FOM(data[, -(test_index)], nlvls = k, clust.list, dist)

  # Sort both tables by their distance column so rows can be paired up.
  fom.test <- dplyr::arrange(fom.test, V2)
  data.test <- dplyr::arrange(data.test, V4)

  # cor() needs numeric input; the script chunks in this file coerce these
  # columns before plotting, so do the same here (a no-op when they are
  # already numeric).
  for (metric in c("V1", "V2", "V3")) {
    data.test[[metric]] <- as.numeric(data.test[[metric]])
  }
  data.test$FOM <- as.numeric(fom.test$V1)

  # graphing and displaying correlations
  g1 <- .superv_scatter(
    data.test, "V1", "V3", "V4",
    "Proportion of hits", "Scores", "proportion of hits", "Score values"
  )
  g2 <- .superv_scatter(
    data.test, "V2", "V3", "V4",
    "F1 score", "Scores", "F1", "Score values"
  )
  g3 <- .superv_scatter(
    data.test, "FOM", "V3", "V4",
    "FOM score", "Scores", "FOM", "Score values"
  )
  # The y axis of g4/g5 is FOM, so it is labelled as such.
  g4 <- .superv_scatter(
    data.test, "V1", "FOM", "V4",
    "Proportion of hits", "FOM score", "proportion of hits", "FOM Score values"
  )
  g5 <- .superv_scatter(
    data.test, "V2", "FOM", "V4",
    "F1 score", "FOM score", "F1", "FOM Score values"
  )

  show(g1)
  show(g2)
  show(g3)
  cat("Correlations between score and other variables: \n")
  cat("Score versus proportion of hits: ",
      cor(data.test$V3, data.test$V1, method = "spearman"), "\n")
  cat("Score versus F1: ",
      cor(data.test$V3, data.test$V2, method = "spearman"), "\n")
  cat("Score versus FOM: ",
      cor(data.test$V3, data.test$FOM, method = "spearman"), "\n")

  show(g4)
  show(g5)
  cat("Correlations between FOM and other variables: \n")
  cat("FOM versus F1: ",
      cor(data.test$FOM, data.test$V2, method = "spearman"), "\n")
  cat("FOM versus proportion of hits: ",
      cor(data.test$FOM, data.test$V1, method = "spearman"), "\n")

  data.test
}

# Cross-validated counterpart of display.superv_comp(): the comparison table
# comes from supervised_comparing_L_cross_val() and uses the columns V1
# (plotted as the score), V2 (plotted as the distance), hits and F1.
# Correlations here use cor()'s default method, unlike display.superv_comp(),
# which requests Spearman.
#
# Arguments and return value are as for display.superv_comp().
CV_display.superv_comp <- function(data, clust.list, test_index, dist = NA) {
  k <- nlevels(data[, test_index])
  data.test <- supervised_comparing_L_cross_val(
    data,
    clust.list,
    test_index = test_index,
    dists = dist
  )
  fom.test <- FOM(data[, -(test_index)], nlvls = k, clust.list, dist)

  # Sort both tables by their distance column so rows can be paired up.
  data.test <- dplyr::arrange(data.test, V2)
  fom.test$V1 <- as.numeric(fom.test$V1)
  fom.test <- dplyr::arrange(fom.test, V2)
  data.test$FOM <- fom.test$V1

  # graphing and displaying correlations
  g1 <- .superv_scatter(
    data.test, "hits", "V1", "V2",
    "Proportion of hits", "Scores", "proportion of hits", "Score values"
  )
  g2 <- .superv_scatter(
    data.test, "F1", "V1", "V2",
    "F1 score", "Scores", "F1", "Score values"
  )
  g3 <- .superv_scatter(
    data.test, "FOM", "V1", "V2",
    "FOM score", "Scores", "FOM", "Score values"
  )
  # The y axis of g4/g5 is FOM, so it is labelled as such.
  g4 <- .superv_scatter(
    data.test, "hits", "FOM", "V2",
    "Proportion of hits", "FOM score", "proportion of hits", "FOM Score values"
  )
  g5 <- .superv_scatter(
    data.test, "F1", "FOM", "V2",
    "F1 score", "FOM score", "F1", "FOM Score values"
  )

  show(g1)
  show(g2)
  show(g3)
  cat("Correlations between cross validated score and other variables: \n")
  cat("Score versus proportion of hits: ",
      cor(data.test$V1, data.test$hits), "\n")
  cat("Score versus F1: ", cor(data.test$V1, data.test$F1), "\n")
  cat("Score versus FOM: ", cor(data.test$V1, data.test$FOM), "\n")

  show(g4)
  show(g5)
  cat("Correlations between FOM and other variables: \n")
  # Each label is paired with the matching column (F1 with F1, hits with hits).
  cat("FOM versus F1: ", cor(data.test$FOM, data.test$F1), "\n")
  cat("FOM versus proportion of hits: ",
      cor(data.test$FOM, data.test$hits), "\n")

  data.test
}

# Simulates a two-class Gaussian data set.
#
# n      number of observations.
# mu_0   mean vector of class 0.
# mu_1   mean vector of class 1.
# p      number of features (default 2).
# S      covariance matrix shared by both classes; identity when NULL.
# seed   value passed to set.seed(); note this resets the global RNG state.
#
# Returns a data frame with p feature columns and a factor column Y. The
# feature columns are renamed X1, X2 only when p == 2.
data.generator <- function(n, mu_0, mu_1, p = 2, S = NULL, seed = 500) {
  set.seed(seed)
  Y <- sample(c(0, 1), n, replace = TRUE, prob = c(0.5, 0.5))
  if (is.null(S)) {
    S <- diag(nrow = p, ncol = p)
  }

  # Draw class 0 first, then class 1, rounding every value to 4 decimals.
  X <- matrix(nrow = n, ncol = p)
  X[Y == 0, ] <- round(mvrnorm(sum(Y == 0), mu_0, S), 4)
  X[Y == 1, ] <- round(mvrnorm(sum(Y == 1), mu_1, S), 4)

  sim.data <- as.data.frame(cbind(X, Y))
  sim.data$Y <- as.factor(Y)
  if (p == 2) {
    colnames(sim.data) <- c("X1", "X2", "Y")
  }
  row.names(sim.data) <- seq_len(nrow(sim.data))
  sim.data
}
Repeating simulation again:
# Overlapping classes with a correlated covariance matrix (default seed).
n <- 170
p <- 2
mu_0 <- c(0, 0)
mu_1 <- c(1, 2)
S <- cbind(c(2, 1), c(1, 3))
sim.data <- data.generator(n, mu_0, mu_1, S = S)
head(sim.data, 4)

display.superv_comp(sim.data, test.list, test_index = 3, dist = dists)
For separated data:
# Well-separated classes; S is left at data.generator()'s default.
n <- 170
p <- 2
mu_0 <- c(0, 0)
mu_1 <- c(6, 8)
sim.data <- data.generator(n, mu_0, mu_1, seed = 150)
head(sim.data, 4)

display.superv_comp(sim.data, test.list, test_index = 3, dist = dists)
# Larger sample (n = 500) with class means (0, 0) and (2, 2).
n <- 500
p <- 2
mu_0 <- c(0, 0)
mu_1 <- c(2, 2)
sim.data <- data.generator(n, mu_0, mu_1, seed = 150)
head(sim.data, 4)

display.superv_comp(sim.data, test.list, test_index = 3, dist = dists)
# Same report on iris; column 5 (Species) is the label.
flowers <- iris
display.superv_comp(flowers, test.list, test_index = 5, dist = dists)
# Overlapping classes again, this time for the cross-validated report.
n <- 170
p <- 2
mu_0 <- c(0, 0)
mu_1 <- c(1, 2)
S <- cbind(c(2, 1), c(1, 3))
sim.data <- data.generator(n, mu_0, mu_1, S = S)
head(sim.data, 4)

tic("Running time for cv_superv_comp")
CV_display.superv_comp(sim.data, test.list, test_index = 3, dist = dists)
toc()
Importing and testing real datasets: \ Wheat seeds dataset
# wheat seeds dataset
# The directory name had a mis-encoded accented character; it is restored to
# "Estatística" here.
# NOTE(review): this is an absolute path on the original author's machine;
# point it at your own copy of seeds_dataset.txt.
wheat_data <- read.delim(
  "C:/Users/lucru/Estatística_UFSCar/cv_cluster/data/seeds_dataset.txt"
)
# X1 is presumably the class-label column (the name read.delim() derived from
# the file's first line) -- confirm against the file. Store it as a factor.
wheat_data$X1 <- as.factor(wheat_data$X1)
head(wheat_data)
# Timed report on the wheat seeds data; column 8 is passed as the label.
tic("Running time for supervisioned comparisson in wheat seeds dataset")
display.superv_comp(
  wheat_data,
  test.list,
  test_index = 8,
  dist = dists
)
toc()
\ Pima Indians diabetes dataset
# prima indians dataset
# Downloaded with data.table::fread() and converted to a plain data frame;
# column V9 is stored as a factor and used as the label below.
library(data.table)
prima_data <- as.data.frame(
  fread("https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv")
)
prima_data$V9 <- as.factor(prima_data$V9)
head(prima_data)

tic("Running time for supervisioned comparisson in prima indians diabetes dataset")
display.superv_comp(
  prima_data,
  test.list,
  test_index = 9,
  dist = dists
)
toc()
\ Ionosphere dataset
# Ionosphere data read as comma-separated values from the UCI repository;
# column V35 is stored as a factor and used as the label below.
ionosphere_data <- as.data.frame(
  read.table(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data",
    sep = ","
  )
)
ionosphere_data$V35 <- as.factor(ionosphere_data$V35)
head(ionosphere_data)

tic("Running time for supervisioned comparisson in ionosphere dataset")
display.superv_comp(
  ionosphere_data,
  test.list,
  test_index = 35,
  dist = dists
)
toc()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.