# Global knitr chunk options: show the code, shrink and centre figures,
# and keep warnings out of the rendered document.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names.
knitr::opts_chunk$set(
  echo = TRUE,
  out.width = "85%",
  fig.align = "center",
  warning = FALSE
)
Importing needed packages
library(ape) library(dendextend) library(cluster) library(tibble) library(magrittr) library(dplyr) library(phytools) library(mltools) library(data.table) library(factoextra) source("C:/Users/lucru/Estatística_UFSCar/cv_cluster/modules/convert_to_parenthesis.R") source("C:/Users/lucru/Estatística_UFSCar/cv_cluster/modules/cv_score.R") library(tictoc) library(mvMORPH) library(dplyr) library(tidyr) library(RColorBrewer) library(MASS) library(kableExtra) library(plyr) library(clValid)
To facilitate all investigations on several datasets (including simulated ones), we can draft a function that makes pairwise plots of FOM and our score versus the other variables:
# ---- private helpers ---------------------------------------------------------

# Build one scatterplot with the styling shared by every comparison plot.
#
# df      : data frame holding the columns referenced in `mapping`.
# mapping : an aes() specification giving x, y and colour.
# x_lab   : label of the x axis.
# title   : plot title.
# Returns the ggplot object; nothing is drawn here.
.superv_scatter <- function(df, mapping, x_lab, title) {
  ggplot(df, mapping) +
    geom_point() +
    theme_minimal() +
    labs(x = x_lab, y = "Scores", colour = "Distances", title = title) +
    theme(
      text = element_text(size = 11, family = "serif"),
      plot.title = element_text(hjust = 0.5)
    ) +
    scale_colour_brewer(palette = "Set1")
}

# Print `label` followed by the Spearman correlation between x and y.
.cat_spearman <- function(label, x, y) {
  cat(label, cor(x, y, method = "spearman"), "\n")
}

# Draw the five plots and print the correlations for a (non cross-validated)
# comparison table. `df` must contain the columns V1, V2, V3, V4 and FOM;
# judging by the axis labels used below these are the proportion of hits (V1),
# F1 (V2), the score (V3) and the distance (V4).
.report_superv_comp <- function(df) {
  g1 <- .superv_scatter(
    df, aes(x = V1, y = V3, colour = V4), "Proportion of hits",
    "Scatterplot of proportion of hits versus Score values according to different distances"
  )
  g2 <- .superv_scatter(
    df, aes(x = V2, y = V3, colour = V4), "F1 score",
    "Scatterplot of F1 versus Score values according to different distances"
  )
  g3 <- .superv_scatter(
    df, aes(x = FOM, y = V3, colour = V4), "FOM score",
    "Scatterplot of FOM versus Score values according to different distances"
  )
  g4 <- .superv_scatter(
    df, aes(x = V1, y = FOM, colour = V4), "Proportion of hits",
    "Scatterplot of proportion of hits versus FOM Score values according to different distances"
  )
  g5 <- .superv_scatter(
    df, aes(x = V2, y = FOM, colour = V4), "F1 score",
    "Scatterplot of F1 versus FOM Score values according to different distances"
  )

  show(g1)
  show(g2)
  show(g3)
  cat("Correlations between score and other variables: \n")
  .cat_spearman("Score versus proportion of hits: ", df$V3, df$V1)
  .cat_spearman("Score versus F1: ", df$V3, df$V2)
  .cat_spearman("Score versus FOM: ", df$V3, df$FOM)

  show(g4)
  show(g5)
  cat("Correlations between FOM and other variables: \n")
  .cat_spearman("FOM versus F1: ", df$FOM, df$V2)
  .cat_spearman("FOM versus proportion of hits: ", df$FOM, df$V1)

  invisible(df)
}

# ---- public functions --------------------------------------------------------

# function for default score:
# Runs all_supervised_comparing() and FOM() on `data`, merges the FOM values
# into the comparison table, drops rows whose score (V3) is NA, then plots the
# pairwise relations and prints Spearman correlations.
#
# data       : data frame whose column `test_index` is the class factor.
# clust.list : named list of clustering functions and their methods.
# test_index : position of the label column in `data`.
# scale      : forwarded to all_supervised_comparing() and FOM().
# dist       : vector of distance names, forwarded likewise.
# Returns the comparison table with the extra FOM column.
display.superv_comp <- function(data, clust.list, test_index, scale = FALSE, dist = NA) {
  k <- nlevels(data[, test_index])
  data.test <- all_supervised_comparing(
    data,
    clust = clust.list, test_index = test_index, dist = dist, scale = scale
  )
  fom.test <- FOM(data[, -(test_index)], nlvls = k, clust.list, dist, scale = scale)

  # Both tables are sorted by their distance column so that the FOM values
  # can be attached row by row.
  fom.test <- dplyr::arrange(fom.test, V2)
  data.test <- dplyr::arrange(data.test, V4)
  data.test$FOM <- as.numeric(fom.test$V1)

  # dropping NA's from V3
  data.test <- tidyr::drop_na(data.test, V3)

  .report_superv_comp(data.test)
  data.test
}

# Cross-validated counterpart of display.superv_comp(): the table comes from
# supervised_comparing_L_cross_val(), whose columns used here are V1 (plotted
# as the score), V2 (distance), hits and F1. No NA filtering is done, so the
# printed correlations are NA when any of those columns contains NA/NaN.
#
# Arguments are as in display.superv_comp() (note that `dist` precedes `scale`).
# Returns the cross-validated table with the extra FOM column.
CV_display.superv_comp <- function(data, clust.list, test_index, dist = NA, scale = FALSE) {
  k <- nlevels(data[, test_index])
  data.test <- supervised_comparing_L_cross_val(
    data, clust.list,
    test_index = test_index, dists = dist, scale = scale
  )
  fom.test <- FOM(data[, -(test_index)], nlvls = k, clust.list, dist, scale = scale)

  # Sort both tables by distance so the FOM values line up row by row.
  data.test <- dplyr::arrange(data.test, V2)
  fom.test$V1 <- as.numeric(fom.test$V1)
  fom.test <- dplyr::arrange(fom.test, V2)
  data.test$FOM <- fom.test$V1

  g1 <- .superv_scatter(
    data.test, aes(x = hits, y = V1, colour = V2), "Proportion of hits",
    "Scatterplot of proportion of hits versus Cross-Validated Score values according to different distances"
  )
  g2 <- .superv_scatter(
    data.test, aes(x = F1, y = V1, colour = V2), "F1 score",
    "Scatterplot of F1 versus Cross-Validated Score values according to different distances"
  )
  g3 <- .superv_scatter(
    data.test, aes(x = FOM, y = V1, colour = V2), "FOM score",
    "Scatterplot of FOM versus Cross-Validated Score values according to different distances"
  )
  g4 <- .superv_scatter(
    data.test, aes(x = hits, y = FOM, colour = V2), "Proportion of hits",
    "Scatterplot of proportion of hits versus FOM Score values according to different distances"
  )
  g5 <- .superv_scatter(
    data.test, aes(x = F1, y = FOM, colour = V2), "F1 score",
    "Scatterplot of F1 versus FOM Score values according to different distances"
  )

  show(g1)
  show(g2)
  show(g3)
  cat("Correlations between cross validated score and other variables: \n")
  .cat_spearman("Cross validated score versus proportion of hits: ", data.test$V1, data.test$hits)
  .cat_spearman("Cross validated score versus F1: ", data.test$V1, data.test$F1)
  .cat_spearman("Cross validated score versus FOM: ", data.test$V1, data.test$FOM)

  show(g4)
  show(g5)
  cat("Correlations between FOM and other variables: \n")
  .cat_spearman("FOM versus proportion of hits: ", data.test$FOM, data.test$hits)
  .cat_spearman("FOM versus F1: ", data.test$FOM, data.test$F1)

  data.test
}

# Plot and report on an already computed comparison table.
# `data` must already hold the columns V1, V2, V3, V4 and FOM; nothing is
# recomputed and no rows are dropped. Returns `data` unchanged.
display_data.superv_comp <- function(data) {
  data.test <- data
  .report_superv_comp(data.test)
  data.test
}

# Simulate a two-class Gaussian data set.
#
# n     : number of observations.
# mu_0  : mean vector of class 0.
# mu_1  : mean vector of class 1.
# p     : number of features (must match the length of the mean vectors).
# S     : covariance matrix shared by both classes; identity when NULL.
# seed  : passed to set.seed(), so the global RNG state is reset on every call.
# Returns a data frame with the p features (rounded to 4 decimals; named X1 and
# X2 when p == 2) and the class factor Y.
data.generator <- function(n, mu_0, mu_1, p = 2, S = NULL, seed = 500) {
  set.seed(seed)
  Y <- sample(c(0, 1), n, replace = TRUE, prob = c(0.5, 0.5))
  if (is.null(S)) {
    S <- diag(nrow = p, ncol = p)
  }

  X <- matrix(nrow = n, ncol = p)
  X[Y == 0, ] <- round(MASS::mvrnorm(sum(Y == 0), mu_0, S), 4)
  X[Y == 1, ] <- round(MASS::mvrnorm(sum(Y == 1), mu_1, S), 4)

  sim.data <- as.data.frame(cbind(X, Y))
  sim.data$Y <- as.factor(Y)
  if (p == 2) {
    colnames(sim.data) <- c("X1", "X2", "Y")
  }
  row.names(sim.data) <- seq_len(nrow(sim.data))
  sim.data
}
Repeating simulation again:
# Fix the simulation parameters: two overlapping classes sharing a
# correlated covariance matrix.
set.seed(122)
n <- 170
p <- 2
mu_0 <- c(0, 0)
mu_1 <- c(1, 2)
S <- matrix(c(2, 1, 1, 3), nrow = 2)

sim.data <- data.generator(n, mu_0, mu_1, S = S)
head(sim.data, 4)
Defining a test list and distance vector for further supervised comparisons:
# Clustering functions and linkage methods under comparison
# (the hclust "centroid" linkage is left out).
test.list <- list(
  hclust = c(
    "ward.D", "single", "ward.D2", "average",
    "complete", "mcquitty", "median"
  ),
  agnes = c("weighted", "average", "ward"),
  diana = NA
)

# Distances evaluated for every method above.
dists <- c("euclidean", "manhattan", "canberra")

display.superv_comp(sim.data, clust.list = test.list, test_index = 3, dist = dists)
For separated data:
# Well separated classes (identity covariance).
n <- 170
p <- 2
mu_0 <- c(0, 0)
mu_1 <- c(6, 8)
sim.data <- data.generator(n, mu_0, mu_1, seed = 150)
head(sim.data, 4)

sim.test_1 <- display.superv_comp(sim.data, test.list, test_index = 3, dist = dists)

# Larger sample with closer class means.
n <- 500
p <- 2
mu_0 <- c(0, 0)
mu_1 <- c(2, 2)
sim.data <- data.generator(n, mu_0, mu_1, seed = 150)
head(sim.data, 4)

sim.test_2 <- display.superv_comp(sim.data, test.list, test_index = 3, dist = dists)

# Iris: the species label is the fifth column.
flowers <- iris
flowers.test <- display.superv_comp(flowers, test.list, test_index = 5, dist = dists)

# Back to the overlapping, correlated simulation for the cross-validated run.
n <- 170
p <- 2
mu_0 <- c(0, 0)
mu_1 <- c(1, 2)
S <- cbind(c(2, 1), c(1, 3))
sim.data <- data.generator(n, mu_0, mu_1, S = S)
head(sim.data, 4)

tic("Running time for cv_superv_comp")
cv_sim.test <- CV_display.superv_comp(sim.data, test.list, test_index = 3, dist = dists)
toc()
Importing and testing real datasets: \ Pima Indians diabetes dataset
n <- 350

# Pima Indians diabetes dataset
library(data.table)
prima_data <- as.data.frame(
  fread("https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv")
)

# Keep a random subset of n rows (the full data set has too many samples).
selected_rows <- sample(seq_len(nrow(prima_data)), n, replace = FALSE)
prima_data$V9 <- as.factor(prima_data$V9)
prima_data <- prima_data[selected_rows, ]
row.names(prima_data) <- seq_len(nrow(prima_data))
head(prima_data)

tic("Running time for supervisioned comparisson in prima indians diabetes dataset")
prima_data.test <- display.superv_comp(prima_data, test.list, test_index = 9, dist = dists)
toc()
Wheat seeds dataset
# wheat seeds dataset
# The file has no header row: without header = FALSE the first observation is
# consumed as column names (which is where the old "X1" label column came
# from). Columns are therefore V1..V8, V8 being the class label, as in the
# later imports of this same file.
wheat_data <- read.delim(
  "C:/Users/lucru/Estatística_UFSCar/cv_cluster/data/seeds_dataset.txt",
  header = FALSE
)
wheat_data$V8 <- as.factor(wheat_data$V8)
head(wheat_data)

tic("Running time for supervisioned comparisson in wheat seeds dataset")
wheat_data.test <- display.superv_comp(wheat_data, test.list, test_index = 8, dist = dists)
toc()
\ Ionosphere dataset
ionosphere_data <- as.data.frame(
  read.table(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data",
    sep = ","
  )
)
ionosphere_data$V35 <- as.factor(ionosphere_data$V35)

# Drop the second variable; the label then sits in column 34.
ionosphere_data <- ionosphere_data[, -2]

pca_ionosphere <- prcomp(ionosphere_data[, -34], center = TRUE, scale. = TRUE)
summary(pca_ionosphere)

# Keep the first 12 principal components and re-attach the label.
ionosphere_components <- as.data.frame((pca_ionosphere$x)[, 1:12])
ionosphere_components$label <- ionosphere_data[, 34]
head(ionosphere_components)

tic("Running time for supervisioned comparisson in ionosphere dataset")
ionosphere_data.test <- display.superv_comp(
  ionosphere_components, test.list,
  test_index = 13, dist = dists
)
toc()
\ Glass data
glass.data <- as.data.frame(
  read.table(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data",
    sep = ","
  )
)
# The first column is removed, so the V11 label ends up in position 10.
glass.data <- glass.data[, -1]
glass.data$V11 <- as.factor(glass.data$V11)
head(glass.data)

tic("Running time for supervisioned comparisson in glass dataset")
glass_data.test <- display.superv_comp(glass.data, test.list, test_index = 10, dist = dists)
toc()
\ Haberman's survival
haberman.data <- as.data.frame(
  read.table(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data",
    sep = ","
  )
)
# V4 is the class label.
haberman.data$V4 <- as.factor(haberman.data$V4)
head(haberman.data)

# The timer label previously said "glass dataset" (copied from the glass
# chunk); it now names the data set that is actually being timed.
tic("Running time for supervisioned comparisson in haberman dataset")
haberman_data.test <- display.superv_comp(haberman.data, test.list, test_index = 4, dist = dists)
toc()
\ Wine data
wine.data <- as.data.frame(
  read.table(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
    sep = ","
  )
)
# Here the class label is the first column.
wine.data$V1 <- as.factor(wine.data$V1)
head(wine.data)

tic("Running time for supervisioned comparisson in wine dataset")
wine_data.test <- display.superv_comp(wine.data, test.list, test_index = 1, dist = dists)
toc()
Investigating some outliers
# Dendrograms of three method/distance combinations on the scaled wine
# features (label column removed).
hclust.median.canb <- wine.data[, -1] %>%
  scale() %>%
  dist(method = "canberra") %>%
  hclust(method = "median")
fviz_dend(hclust.median.canb)

hclust.single.eucl <- wine.data[, -1] %>%
  scale() %>%
  dist() %>%
  hclust(method = "single")
fviz_dend(hclust.single.eucl)

hclust.single.manh <- wine.data[, -1] %>%
  scale() %>%
  dist(method = "manhattan") %>%
  hclust(method = "single")
fviz_dend(hclust.single.manh)

# Cross-validated comparison on the wheat seeds data.
tic("Running time for cv_superv_comp in wheat dataset")
cv_wheat.test <- CV_display.superv_comp(wheat_data, test.list, test_index = 8, dist = dists)
toc()
\ Assessing cross-validated score per variable:
# Cross-validated score per variable, wheat seeds (label column 8 removed).
wheat.l_cross_per_var <- L_cross_val_per_var(wheat_data[, -8], test.list, dists)
head(wheat.l_cross_per_var)

# V8 of the result is flattened and turned into a factor.
wheat.l_cross_per_var$V8 <- as.factor(unlist(wheat.l_cross_per_var$V8))
# removing NA
wheat.l_cross_per_var <- na.omit(wheat.l_cross_per_var)
head(wheat.l_cross_per_var)

# Same analysis on simulated data.
n <- 170
p <- 2
mu_0 <- c(0, 0)
mu_1 <- c(2, 2)
S <- cbind(c(2, 1), c(1, 3))
sim.data <- data.generator(n, mu_0, mu_1, S = S)
sim.data.l_cross_per_var <- L_cross_val_per_var(sim.data[, -3], test.list, dists)
head(sim.data.l_cross_per_var)
Preparing plots for each dataset: \ Wheat seeds
# Long format: one row per (combination, variable); the V8 column is repeated
# once for each of the 7 variables.
ggplot_data <- reshape2::melt(wheat.l_cross_per_var[, -8])
ggplot_data$dists <- rep(wheat.l_cross_per_var$V8, times = 7)

ggplot_data %>%
  mutate(variable = as.factor(variable), dists = as.factor(dists)) %>%
  ggplot(aes(x = variable, y = value, colour = dists)) +
  geom_point() +
  theme_minimal() +
  labs(
    x = "Variables names",
    y = "Cross validated score values",
    colour = "Distances",
    title = "Scatterplot of cross validated score values versus variables from wheat seeds dataset"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_colour_brewer(palette = "Set1")
\ Simulated data
# Long format for the simulated data: V3 is repeated for the 2 variables.
ggplot_data <- reshape2::melt(sim.data.l_cross_per_var[, -3])
ggplot_data$dists <- rep(sim.data.l_cross_per_var$V3, times = 2)

ggplot_data %>%
  mutate(variable = as.factor(variable), dists = as.factor(dists)) %>%
  ggplot(aes(x = variable, y = value, colour = dists)) +
  geom_point() +
  theme_minimal() +
  labs(
    x = "Variables names",
    y = "Cross validated score values",
    colour = "Distances",
    title = "Scatterplot of cross validated score values versus variables from simulated dataset"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_colour_brewer(palette = "Set1")
\ Generating normal data with very different variances and equal means:
# Second variable has a much larger variance (16 versus 2).
n <- 200
p <- 2
mu_0 <- c(0, 0)
mu_1 <- c(2, 2)
S <- cbind(c(2, 1), c(1, 16))
sim.data <- data.generator(n, mu_0, mu_1, S = S)
head(sim.data, 4)

sim.data.l_cross_per_var <- L_cross_val_per_var(sim.data[, -3], test.list, dists)
head(sim.data.l_cross_per_var)

# Long format: V3 is repeated for the 2 variables.
ggplot_data <- reshape2::melt(sim.data.l_cross_per_var[, -3])
ggplot_data$dists <- rep(sim.data.l_cross_per_var$V3, times = 2)

ggplot_data %>%
  mutate(variable = as.factor(variable), dists = as.factor(dists)) %>%
  ggplot(aes(x = variable, y = value, colour = dists)) +
  geom_point() +
  theme_minimal() +
  labs(
    x = "Variables names",
    y = "Cross validated score values",
    colour = "Distances",
    title = "Scatterplot of cross validated score values versus variables from simulated dataset"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_colour_brewer(palette = "Set1")
\ Testing score for scaled variables:
# Per-variable cross-validated score on scaled wheat features.
wheat.l_cross_per_var_scaled <- L_cross_val_per_var(
  wheat_data[, -8], test.list, dists,
  scale = TRUE
)
head(wheat.l_cross_per_var_scaled)

wheat.l_cross_per_var_scaled$V8 <- as.factor(unlist(wheat.l_cross_per_var_scaled$V8))
# removing NA
wheat.l_cross_per_var_scaled <- na.omit(wheat.l_cross_per_var_scaled)

# Same for simulated data.
n <- 170
p <- 2
mu_0 <- c(0, 0)
mu_1 <- c(2, 2)
S <- cbind(c(2, 1), c(1, 3))
sim.data <- data.generator(n, mu_0, mu_1, S = S)
sim.data.l_cross_per_var_scaled <- L_cross_val_per_var(
  sim.data[, -3], test.list, dists,
  scale = TRUE
)
head(sim.data.l_cross_per_var_scaled)
Preparing plots for each dataset: \ Wheat seeds
nb.cols <- 33
# "Paired" defines at most 12 colours: asking brewer.pal() for 33 only raises
# a warning and still returns 12. Request the 12 base colours explicitly and
# let colorRampPalette() interpolate them up to nb.cols.
mycolors <- colorRampPalette(brewer.pal(12, "Paired"))(nb.cols)

ggplot_data <- reshape2::melt(wheat.l_cross_per_var_scaled[, -8])
ggplot_data$dists <- rep(wheat.l_cross_per_var_scaled$V8, 7)
# One id per row kept after na.omit(), repeated for each of the 7 variables.
# Derived from the table instead of a hard-coded 1:30, so the assignment
# still works if a different number of rows survives.
ggplot_data$obs <- rep(
  as.factor(seq_len(nrow(wheat.l_cross_per_var_scaled))),
  7
)

ggplot_data %>%
  mutate(variable = as.factor(variable), dists = as.factor(dists)) %>%
  ggplot(aes(x = variable, y = value, group = obs)) +
  geom_line(aes(color = obs)) +
  geom_point(aes(color = obs)) +
  scale_colour_manual(values = mycolors) +
  theme_minimal() +
  labs(
    x = "Variables names",
    y = "Cross validated score values",
    colour = "Distances",
    title = "Scatterplot of cross validated scaled score values versus variables from wheat seeds dataset"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  )
\ Simulated data
# Scaled per-variable scores, first simulated data set: one line per
# combination (33 of them) across the 2 variables.
ggplot_data <- reshape2::melt(sim.data.l_cross_per_var_scaled[, -3])
ggplot_data$dists <- rep(sim.data.l_cross_per_var_scaled$V3, times = 2)
ggplot_data$obs <- rep(as.factor(1:33), times = 2)

ggplot_data %>%
  mutate(variable = as.factor(variable), dists = as.factor(dists)) %>%
  ggplot(aes(x = variable, y = value, group = obs)) +
  geom_line(aes(color = obs)) +
  geom_point(aes(color = obs)) +
  scale_colour_manual(values = mycolors) +
  theme_minimal() +
  labs(
    x = "Variables names",
    y = "Cross validated score values",
    colour = "Distances",
    title = "Scatterplot of cross validated scaled score values versus variables from simulated dataset"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  )

# Second simulated data set: larger variance on the second variable.
n <- 170
p <- 2
mu_0 <- c(0, 0)
mu_1 <- c(2, 2)
S <- cbind(c(2, 1), c(1, 12))
sim.data2 <- data.generator(n, mu_0, mu_1, S = S)
sim.data.l_cross_per_var_scaled2 <- L_cross_val_per_var(
  sim.data2[, -3], test.list, dists,
  scale = TRUE
)
head(sim.data.l_cross_per_var_scaled2)

ggplot_data <- reshape2::melt(sim.data.l_cross_per_var_scaled2[, -3])
ggplot_data$dists <- rep(sim.data.l_cross_per_var_scaled2$V3, times = 2)
ggplot_data$obs <- rep(as.factor(1:33), times = 2)

ggplot_data %>%
  mutate(variable = as.factor(variable), dists = as.factor(dists)) %>%
  ggplot(aes(x = variable, y = value, group = obs)) +
  geom_line(aes(color = obs)) +
  geom_point(aes(color = obs)) +
  scale_colour_manual(values = mycolors) +
  theme_minimal() +
  labs(
    x = "Variables names",
    y = "Cross validated score values",
    colour = "Distances",
    title = "Scatterplot of cross validated scaled score values versus variables from simulated dataset"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  )
# Boxplots of X1 and X2 by class, on the raw values.
ggplot.data <- reshape2::melt(sim.data2[, -3])
ggplot.data$Y <- rep(sim.data2$Y, times = 2)

ggplot.data %>%
  ggplot(aes(x = variable, y = value, fill = Y)) +
  geom_boxplot() +
  labs(
    x = "Variables names",
    y = "Values",
    fill = "Category of Y",
    title = "Boxplot of X1 and X2"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_fill_brewer(palette = "Set1")

# Same boxplots after scaling both variables.
ggplot.data <- reshape2::melt(as.data.frame(scale(sim.data2[, -3])))
ggplot.data$Y <- rep(sim.data2$Y, times = 2)

ggplot.data %>%
  ggplot(aes(x = variable, y = value, fill = Y)) +
  geom_boxplot() +
  labs(
    x = "Variables names",
    y = "Values",
    fill = "Category of Y",
    title = "Boxplot of scaled variables X1 and X2"
  ) +
  theme(
    text = element_text(size = 11, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_fill_brewer(palette = "Set1")
\ Repeating all supervised-comparing for scaled score and cross validated score:
# Fix the simulation parameters (the generator is seeded with 250 here).
set.seed(122)
n <- 170
p <- 2
mu_0 <- c(0, 0)
mu_1 <- c(1, 2)
S <- matrix(c(2, 1, 1, 3), nrow = 2)

sim.data <- data.generator(n, mu_0, mu_1, S = S, seed = 250)
head(sim.data, 4)
Defining a test list and distance vector for further supervised comparisons:
# Clustering functions and linkage methods under comparison
# (the hclust "centroid" linkage is left out).
test.list <- list(
  hclust = c(
    "ward.D", "single", "ward.D2", "average",
    "complete", "mcquitty", "median"
  ),
  agnes = c("weighted", "average", "ward"),
  diana = NA
)
dists <- c("euclidean", "manhattan", "canberra")

cv_sim.data <- CV_display.superv_comp(
  sim.data, test.list,
  test_index = 3, dist = dists, scale = TRUE
)
cv_sim.data
Increasing sample size
# Larger simulated sample with identity covariance.
n <- 280
p <- 2
mu_0 <- c(0, 0)
mu_1 <- c(2, 2)
sim.data <- data.generator(n, mu_0, mu_1, seed = 150)
head(sim.data, 4)

cv_sim.test_2 <- CV_display.superv_comp(
  sim.data, test.list,
  test_index = 3, dist = dists, scale = TRUE
)

# Iris: the species label is the fifth column.
flowers <- iris
cv_flowers.test <- CV_display.superv_comp(
  flowers, test.list,
  test_index = 5, dist = dists, scale = TRUE
)
Importing and testing real datasets: \ Pima Indians diabetes dataset
n <- 350

# Pima Indians diabetes dataset
library(data.table)
prima_data <- as.data.frame(
  fread("https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv")
)

# Keep a random subset of n rows (the full data set has too many samples).
selected_rows <- sample(seq_len(nrow(prima_data)), n, replace = FALSE)
prima_data$V9 <- as.factor(prima_data$V9)
prima_data <- prima_data[selected_rows, ]
row.names(prima_data) <- seq_len(nrow(prima_data))
head(prima_data)

tic("Running time for supervisioned comparisson in prima indians diabetes dataset")
cv_prima_data.test <- CV_display.superv_comp(
  prima_data, test.list,
  test_index = 9, dist = dists, scale = TRUE
)
toc()

# Remove rows containing NaN and recompute the correlations.
# (na.omit() was previously handed the data a second time as a stray argument.)
cv_prima_data.test <- na.omit(cv_prima_data.test)

cat("Correlations between cross validated score and other variables: \n")
cat(
  "Cross validated score versus proportion of hits: ",
  cor(cv_prima_data.test$V1, cv_prima_data.test$hits, method = "spearman"), "\n"
)
cat(
  "Cross validated score versus F1: ",
  cor(cv_prima_data.test$V1, cv_prima_data.test$F1, method = "spearman"), "\n"
)
cat(
  "Cross validated score versus FOM: ",
  cor(cv_prima_data.test$V1, cv_prima_data.test$FOM, method = "spearman"), "\n"
)

# The two labels below used to be swapped: the correlation with `hits` was
# printed as "FOM versus F1" and the one with `F1` as "proportion of hits".
cat("Correlations between FOM and other variables: \n")
cat(
  "FOM versus proportion of hits: ",
  cor(cv_prima_data.test$FOM, cv_prima_data.test$hits, method = "spearman"), "\n"
)
cat(
  "FOM versus F1: ",
  cor(cv_prima_data.test$FOM, cv_prima_data.test$F1, method = "spearman"), "\n"
)
Wheat seeds dataset
# Wheat seeds dataset: no header row, class label in V8.
wheat_data <- read.delim(
  "C:/Users/lucru/Estatística_UFSCar/cv_cluster/data/seeds_dataset.txt",
  header = FALSE
)
wheat_data$V8 <- as.factor(wheat_data$V8)
head(wheat_data)

tic("Running time for supervisioned comparisson in wheat seeds dataset")
cv_wheat_data.test <- CV_display.superv_comp(
  wheat_data, test.list,
  test_index = 8, dist = dists, scale = TRUE
)
toc()
\ Ionosphere dataset
ionosphere_data <- as.data.frame(
  read.table(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data",
    sep = ","
  )
)
ionosphere_data$V35 <- as.factor(ionosphere_data$V35)

# Drop the second variable; the label then sits in column 34.
ionosphere_data <- ionosphere_data[, -2]

pca_ionosphere <- prcomp(ionosphere_data[, -34], center = TRUE, scale. = TRUE)
summary(pca_ionosphere)

# Keep the first 12 principal components and re-attach the label.
ionosphere_components <- as.data.frame((pca_ionosphere$x)[, 1:12])
ionosphere_components$label <- ionosphere_data[, 34]
head(ionosphere_components)

tic("Running time for supervisioned comparisson in ionosphere dataset")
cv_ionosphere_data.test <- CV_display.superv_comp(
  ionosphere_components, test.list,
  test_index = 13, dist = dists, scale = TRUE
)
toc()
\
# Remove rows containing NaN and recompute the correlations.
# (na.omit() was previously handed the data a second time as a stray argument.)
cv_ionosphere_data.test <- na.omit(cv_ionosphere_data.test)

cat("Correlations between cross validated score and other variables: \n")
cat(
  "Cross validated score versus proportion of hits: ",
  cor(cv_ionosphere_data.test$V1, cv_ionosphere_data.test$hits, method = "spearman"), "\n"
)
cat(
  "Cross validated score versus F1: ",
  cor(cv_ionosphere_data.test$V1, cv_ionosphere_data.test$F1, method = "spearman"), "\n"
)
cat(
  "Cross validated score versus FOM: ",
  cor(cv_ionosphere_data.test$V1, cv_ionosphere_data.test$FOM, method = "spearman"), "\n"
)

# The two labels below used to be swapped: the correlation with `hits` was
# printed as "FOM versus F1" and the one with `F1` as "proportion of hits".
cat("Correlations between FOM and other variables: \n")
cat(
  "FOM versus proportion of hits: ",
  cor(cv_ionosphere_data.test$FOM, cv_ionosphere_data.test$hits, method = "spearman"), "\n"
)
cat(
  "FOM versus F1: ",
  cor(cv_ionosphere_data.test$FOM, cv_ionosphere_data.test$F1, method = "spearman"), "\n"
)
Glass data
glass.data <- as.data.frame(
  read.table(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data",
    sep = ","
  )
)
# The first column is removed, so the V11 label ends up in position 10.
glass.data <- glass.data[, -1]
glass.data$V11 <- as.factor(glass.data$V11)
head(glass.data)

tic("Running time for supervisioned comparisson in glass dataset")
cv_glass_data.test <- CV_display.superv_comp(
  glass.data, test.list,
  test_index = 10, dist = dists, scale = TRUE
)
toc()
\ Haberman's survival
haberman.data <- as.data.frame(
  read.table(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data",
    sep = ","
  )
)
# V4 is the class label.
haberman.data$V4 <- as.factor(haberman.data$V4)
head(haberman.data)

# The timer label previously said "glass dataset" (copied from the glass
# chunk); it now names the data set that is actually being timed.
tic("Running time for supervisioned comparisson in haberman dataset")
cv_haberman_data.test <- CV_display.superv_comp(
  haberman.data, test.list,
  test_index = 4, dist = dists, scale = TRUE
)
toc()
\ Wine data
wine.data <- as.data.frame(
  read.table(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
    sep = ","
  )
)
# Here the class label is the first column.
wine.data$V1 <- as.factor(wine.data$V1)
head(wine.data)

# NOTE(review): unlike the other calls in this section, `scale` is not set
# here, so the function default applies - confirm this is intended.
tic("Running time for supervisioned comparisson in wine dataset")
cv_wine_data.test <- CV_display.superv_comp(wine.data, test.list, test_index = 1, dist = dists)
toc()
\ Repeating all supervised-comparing for scaled score and cross validated score, with fewer combinations:
# Fix the simulation parameters (the generator is seeded with 250 here).
set.seed(122)
n <- 170
p <- 2
mu_0 <- c(0, 0)
mu_1 <- c(1, 2)
S <- matrix(c(2, 1, 1, 3), nrow = 2)

sim.data <- data.generator(n, mu_0, mu_1, S = S, seed = 250)
head(sim.data, 4)
Defining a test list and distance vector for further supervised comparisons:
# Reduced set of combinations: all hclust linkages except "centroid",
# a single agnes method, diana, and two distances.
test.list <- list(
  hclust = c(
    "ward.D", "single", "ward.D2", "average",
    "complete", "mcquitty", "median"
  ),
  agnes = c("weighted"),
  diana = NA
)
dists <- c("euclidean", "manhattan")

cv_sim.data <- CV_display.superv_comp(
  sim.data, test.list,
  test_index = 3, dist = dists, scale = TRUE
)
cv_sim.data
Increasing sample size
# Larger simulated sample with identity covariance.
n <- 280
p <- 2
mu_0 <- c(0, 0)
mu_1 <- c(2, 2)
sim.data <- data.generator(n, mu_0, mu_1, seed = 150)
head(sim.data, 4)

cv_sim.test_2 <- CV_display.superv_comp(
  sim.data, test.list,
  test_index = 3, dist = dists, scale = TRUE
)

# Iris: the species label is the fifth column.
flowers <- iris
cv_flowers.test <- CV_display.superv_comp(
  flowers, test.list,
  test_index = 5, dist = dists, scale = TRUE
)
Importing and testing real datasets: \ Pima Indians diabetes dataset
n <- 350

# Pima Indians diabetes dataset
library(data.table)
prima_data <- as.data.frame(
  fread("https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv")
)

# Keep a random subset of n rows (the full data set has too many samples).
selected_rows <- sample(seq_len(nrow(prima_data)), n, replace = FALSE)
prima_data$V9 <- as.factor(prima_data$V9)
prima_data <- prima_data[selected_rows, ]
row.names(prima_data) <- seq_len(nrow(prima_data))
head(prima_data)

tic("Running time for supervisioned comparisson in prima indians diabetes dataset")
cv_prima_data.test <- CV_display.superv_comp(
  prima_data, test.list,
  test_index = 9, dist = dists, scale = TRUE
)
toc()

# Remove rows containing NaN and recompute the correlations.
# (na.omit() was previously handed the data a second time as a stray argument.)
cv_prima_data.test <- na.omit(cv_prima_data.test)

cat("Correlations between cross validated score and other variables: \n")
cat(
  "Cross validated score versus proportion of hits: ",
  cor(cv_prima_data.test$V1, cv_prima_data.test$hits, method = "spearman"), "\n"
)
cat(
  "Cross validated score versus F1: ",
  cor(cv_prima_data.test$V1, cv_prima_data.test$F1, method = "spearman"), "\n"
)
cat(
  "Cross validated score versus FOM: ",
  cor(cv_prima_data.test$V1, cv_prima_data.test$FOM, method = "spearman"), "\n"
)

# The two labels below used to be swapped: the correlation with `hits` was
# printed as "FOM versus F1" and the one with `F1` as "proportion of hits".
cat("Correlations between FOM and other variables: \n")
cat(
  "FOM versus proportion of hits: ",
  cor(cv_prima_data.test$FOM, cv_prima_data.test$hits, method = "spearman"), "\n"
)
cat(
  "FOM versus F1: ",
  cor(cv_prima_data.test$FOM, cv_prima_data.test$F1, method = "spearman"), "\n"
)
Wheat seeds dataset
# Wheat seeds dataset: no header row, class label in V8.
wheat_data <- read.delim(
  "C:/Users/lucru/Estatística_UFSCar/cv_cluster/data/seeds_dataset.txt",
  header = FALSE
)
wheat_data$V8 <- as.factor(wheat_data$V8)
head(wheat_data)

tic("Running time for supervisioned comparisson in wheat seeds dataset")
cv_wheat_data.test <- CV_display.superv_comp(
  wheat_data, test.list,
  test_index = 8, dist = dists, scale = TRUE
)
toc()
\ Ionosphere dataset
ionosphere_data <- as.data.frame(
  read.table(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data",
    sep = ","
  )
)
ionosphere_data$V35 <- as.factor(ionosphere_data$V35)

# Drop the second variable; the label then sits in column 34.
ionosphere_data <- ionosphere_data[, -2]

pca_ionosphere <- prcomp(ionosphere_data[, -34], center = TRUE, scale. = TRUE)
summary(pca_ionosphere)

# Keep the first 12 principal components and re-attach the label.
ionosphere_components <- as.data.frame((pca_ionosphere$x)[, 1:12])
ionosphere_components$label <- ionosphere_data[, 34]
head(ionosphere_components)

tic("Running time for supervisioned comparisson in ionosphere dataset")
cv_ionosphere_data.test <- CV_display.superv_comp(
  ionosphere_components, test.list,
  test_index = 13, dist = dists, scale = TRUE
)
toc()
Glass data
glass.data <- as.data.frame(
  read.table(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data",
    sep = ","
  )
)
# The first column is removed, so the V11 label ends up in position 10.
glass.data <- glass.data[, -1]
glass.data$V11 <- as.factor(glass.data$V11)
head(glass.data)

tic("Running time for supervisioned comparisson in glass dataset")
cv_glass_data.test <- CV_display.superv_comp(
  glass.data, test.list,
  test_index = 10, dist = dists, scale = TRUE
)
toc()
\ Haberman's survival
haberman.data <- as.data.frame(
  read.table(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data",
    sep = ","
  )
)
# V4 is the class label.
haberman.data$V4 <- as.factor(haberman.data$V4)
head(haberman.data)

# The timer label previously said "glass dataset" (copied from the glass
# chunk); it now names the data set that is actually being timed.
tic("Running time for supervisioned comparisson in haberman dataset")
cv_haberman_data.test <- CV_display.superv_comp(
  haberman.data, test.list,
  test_index = 4, dist = dists, scale = TRUE
)
toc()
\ Wine data
wine.data <- as.data.frame(
  read.table(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
    sep = ","
  )
)
# Here the class label is the first column.
wine.data$V1 <- as.factor(wine.data$V1)
head(wine.data)

# NOTE(review): unlike the other calls in this section, `scale` is not set
# here, so the function default applies - confirm this is intended.
tic("Running time for supervisioned comparisson in wine dataset")
cv_wine_data.test <- CV_display.superv_comp(wine.data, test.list, test_index = 1, dist = dists)
toc()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.