trec: Tree Reduced Ensemble Clustering

#context("correctness of TREC")
library(TREC)

test_that("combineClusterings result is exactly the way we construct this algorithm", {
data<-read.table(file = "data_mix_gaussian.txt")
m<-100
n<-3*m
####### Generating clusters via K means with 10 random starts #######
#
#
# 1) k =3
#
#
######

#random starts
nRestarts <- 4
nMethods <- nRestarts
k <- 3

#
#  Get the clusterings from k-means
set.seed(100)
branchComponent <- combineClusterings(kmeans(data,centers = 3),kmeans(data,centers = 4),kmeans(data,centers = 5))
expected <- matrix(data = c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
  NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,1,NA,NA,NA,NA,NA,NA,NA,NA,2,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,1,2,2,3,2,4,1,4,1,2,4,3,3,2,2,3,3,3,4,3,1,4,2,2,3,2,2,2,4,1,2,
  3,4,4,1,3,3,3,3,2,2,1,3,2,1,3,1,1,1,3,2,2,3,1,2,2,1,1,2,4,1,1,3,2,1,2,2,4,2,4,2,4,1,3,3,2,1,4,1,1,1,3,2,2,2,4,4,3,3,2,4,3,2,3,4,1,3,4,1,4,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
  NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA),nrow = 300, ncol = 2)
colnames(expected) <- paste("Level", 1:ncol(expected))
labels <- paste("object", 1:nrow(expected))
rownames(expected) <- labels
expect_equal(branchComponent$treeMatrix,expected)
####### Generating clusters via K means with 10 random starts #######
#
#
# 2) k = 16
#
#
######

#random starts
nRestarts <- 10
nMethods <- nRestarts
k <- 16
set.seed(1000)

branchComponent <- combineClusterings(kmeans(data,centers = 15),kmeans(data,centers = 16),kmeans(data,centers = 17))
expected<-matrix(data = c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,1,4,4,4,5,4,4,5,1,4,4,4,5,1,5,5,4,4,4,4,4,1,5,4,4,4,4,4,4,4,4,4,1,1,4,1,4,4,4,4
,4,4,5,4,4,4,5,4,4,4,4,4,4,4,4,4,5,4,4,1,4,4,4,4,5,4,4,4,4,4,4,5,5,4,4,4,4,4,1,4,4,4,5,5,1,5,4,5,5,4,5,4,4,1,5,4,5,6,6,6,6,6,2,2,3,6,6,6,6,7,6,3,6,6,2,6,2,3,7,2,6,6,6,6,6,6,6,7,6,6,6,6,7,6,6,6,6,6,3,6,7,6,6,2,6,6,6,3,2,6,2,2,6,6,6,6,6,6,6,6,6,6,6,6,2,6,3,2,6,6,2,7,2,6,6,6,7,3,6,2,7,6,6,7,6,3,6,
7,7,6,6,2,6,6,6,6,6,NA,1,2,3,NA,4,5,6,4,7,3,4,8,5,6,3,1,5,4,NA,9,1,2,5,1,8,10,9,11,NA,9,11,12,4,4,2,1,4,4,13,4,8,9,14,1,6,1,2,1,5,14,8,6,5,6,8,4,11,13,9,3,NA,2,NA,1,9,3,6,5,10,NA,6,9,5,2,1,NA,2,9,1,3,13,14,1,8,NA,7,5,5,12,1,9,5,4,NA,5,3,5,8,5,15,16,17,NA,17,18,15,19,15,16,19,NA,18,17,17,20,NA,20,
19,21,22,17,17,16,NA,19,23,17,17,24,17,21,17,18,22,NA,NA,21,NA,17,23,15,21,17,22,20,22,25,22,20,26,17,18,22,17,26,24,24,23,19,22,25,NA,26,15,17,26,19,23,17,16,18,24,21,20,19,25,NA,15,25,25,NA,17,26,23,19,19,NA,20,17,19,20,26,20,17,24,NA,19,22,19,27,28,28,29,27,NA,NA,NA,28,27,30,31,NA,28,NA,32,31,NA,31,NA,NA,NA,NA,31,32,32,29,33,28,32,NA,34,29,27,33,NA,28,27,29,28,29,NA,27,NA,29,34,NA,29,28,27,NA,NA,29,NA,NA,32,27,29,29,29,34,32,27,28,27,29,32,NA,29,NA,NA,29,28,NA,NA,NA,27,28,28,NA,NA,33,NA,NA,32,31,NA,29,NA,31,NA,NA,31,33,NA,31,29,31,30,27),nrow=300,ncol=3)
colnames(expected) <- paste("Level", 1:ncol(expected))
labels <- paste("object", 1:nrow(expected))
rownames(expected) <- labels
expect_equal(branchComponent$treeMatrix,expected)
####### Combining K means (k=2) with Gaussian model based #######
#
######
nMethods <- 2
k <- 3
branchComponent <- combineClusterings(stats::hclust(dist(data),method = "single"),stats::hclust(dist(data),method = "complete"))
expected <- matrix(data = c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
                            ,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
                            2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
                            ,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,4,2,2,2,2,2,2,2,2,2,2,2,2,4,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,4,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
                            2,2,2,2,2,2,2,NA,2,2,2,2,2,3,2,2,2,2,2,2,2,2,NA,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,2,2,2,2,2,2,2,2,NA,2,2,2,2,2,2,2,2,NA,2,2,2,2,2,2,2,2,2,2,2,2,2,2,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,NA,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,1,4,4,4,4,4,4,4,4
                            ,4,1,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,NA,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,4,1,4,6,6,6,6,6,6,6,NA,6,6,6,6,6,6,7,6,6,6,6,6,NA,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,6,6,6,6,6,6,6,6,NA,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
                            6,6,6,6,6,6,6,6,6),nrow = 300,ncol = 3)
colnames(expected) <- paste("Level", 1:ncol(expected))
labels <- paste("object", 1:nrow(expected))
rownames(expected) <- labels
expect_equal(branchComponent$treeMatrix[,1:3],expected)
})

test_that("combineClusterings work for vector or matrix input as well",{

data<-read.table(file = "data_mix_gaussian.txt")
m<-100
n<-3*m
####### test whether matrix input works for combineClusterings ###########
#
# one input is matrix output of single linkage
# the other input is output of complete linkage
nMethods <- 2
clusteringMatrix <- mergeToMatrix(stats::hclust(dist(data),method = "single")$merge)
branchComponent <- combineClusterings(clusteringMatrix,stats::hclust(dist(data),method = "complete"))
expected <- matrix(data = c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
                            ,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
                            2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
                            ,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,4,2,2,2,2,2,2,2,2,2,2,2,2,4,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,4,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
                            2,2,2,2,2,2,2,NA,2,2,2,2,2,3,2,2,2,2,2,2,2,2,NA,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,2,2,2,2,2,2,2,2,NA,2,2,2,2,2,2,2,2,NA,2,2,2,2,2,2,2,2,2,2,2,2,2,2,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,NA,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,1,4,4,4,4,4,4,4,4
                            ,4,1,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,NA,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,4,1,4,6,6,6,6,6,6,6,NA,6,6,6,6,6,6,7,6,6,6,6,6,NA,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,6,6,6,6,6,6,6,6,NA,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
                            6,6,6,6,6,6,6,6,6),nrow = 300,ncol = 3)
colnames(expected) <- paste("Level", 1:ncol(expected))
labels <- paste("object", 1:nrow(expected))
rownames(expected) <- labels
expect_equal(branchComponent$treeMatrix[,1:3],expected)
})


test_that("getClusteringDistance can work, and work correctly", {
data<-read.table(file = "data_mix_gaussian.txt")
m<-100
n<-3*m
####### Generating clusters via K means with 2 random starts #######
#
#
# 1) k =3
#
#
######
  
#random starts
nRestarts <- 2
nMethods <- nRestarts
k <- 3
seeds <- c(220478, 990173)
#
#  Get the clusterings from k-means
set.seed(seeds[1])
kMean1 <- kmeans(data,centers=k,iter.max=15)
set.seed(seeds[2])
kMean2 <- kmeans(data,centers=k+1, iter.max = 15)
distance <- clusDist(kMean1,kMean2)
expect_equal(floor(distance[1]),0)
####### Generating clusters via K means with 10 random starts #######
#
#
# 2) k = 16
#
#
######
  
#random starts
nRestarts <- 2
nMethods <- nRestarts
k <- 16
seeds <- c(220478, 990173)

set.seed(seeds[1])
kMean1 <- kmeans(data,centers=k,iter.max=15)
set.seed(seeds[2])
kMean2 <- kmeans(data,centers=k+1, iter.max = 15)
distance <- clusDist(kMean1,kMean2)
expect_equal(floor(distance[1]),0)
####### Combining K means (k=2) with Gaussian model based #######
#
######
nMethods <- 2
k <- 2
kMean <- kmeans(data,centers=k,iter.max=15)
require(mclust)
mClust <- Mclust(data)

distance <- clusDist(kMean,mClust)
expect_equal(floor(distance[1]),0)
})